diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py
index fff20bf64fec..1e06c9eecf0f 100644
--- a/.automation_scripts/run_pytorch_unit_tests.py
+++ b/.automation_scripts/run_pytorch_unit_tests.py
@@ -338,7 +338,11 @@ def run_test_and_summarize_results(
     # copy current environment variables
     _environ = dict(os.environ)

+<<<<<<< HEAD
+=======
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     # modify path
     test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh"
     test_run_test_path = pytorch_root_dir + "/test/run_test.py"
@@ -385,6 +389,13 @@ def run_test_and_summarize_results(
     global CONSOLIDATED_LOG_FILE_PATH
     CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME

+<<<<<<< HEAD
+=======
+    # Check multi gpu availability if distributed tests are enabled
+    if ("distributed" in test_config) or len(distributed_list) != 0:
+        check_num_gpus_for_distributed()
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     # Install test requirements
     command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
     run_command_and_capture_output(command)
@@ -393,15 +404,23 @@ def run_test_and_summarize_results(
     if not priority_tests and not default_list and not distributed_list and not inductor_list:
         # run entire tests for default, distributed and inductor workflows → use test.sh
         if not test_config:
+<<<<<<< HEAD
+=======
+            check_num_gpus_for_distributed()
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
            # default test process
            res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
            res_all_tests_dict["default"] = res_default_all
            # distributed test process
+<<<<<<< HEAD
            res_distributed_all = {}
            if is_multi_gpus_available_for_distributed():
                res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
            else:
                print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.")
+=======
+            res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
            res_all_tests_dict["distributed"] = res_distributed_all
            # inductor test process
            res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
@@ -414,11 +433,15 @@ def run_test_and_summarize_results(
            res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
            res_all_tests_dict["default"] = res_default_all
        if "distributed" in workflow_list:
+<<<<<<< HEAD
            res_distributed_all = {}
            if is_multi_gpus_available_for_distributed():
                res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
            else:
                print("Warning: Cannot run distributed unit tests. 
Number of visible GPUs should be >1 to run distributed unit tests.") +======= + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) res_all_tests_dict["distributed"] = res_distributed_all if "inductor" in workflow_list: res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) @@ -426,15 +449,23 @@ def run_test_and_summarize_results( # Run priority test for each workflow elif priority_tests and not default_list and not distributed_list and not inductor_list: if not test_config: +<<<<<<< HEAD +======= + check_num_gpus_for_distributed() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # default test process res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) res_all_tests_dict["default"] = res_default_priority # distributed test process +<<<<<<< HEAD res_distributed_priority = {} if is_multi_gpus_available_for_distributed(): res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) else: print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.") +======= + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) res_all_tests_dict["distributed"] = res_distributed_priority # will not run inductor priority tests print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") @@ -446,11 +477,15 @@ def run_test_and_summarize_results( res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) res_all_tests_dict["default"] = res_default_priority if "distributed" in workflow_list: +<<<<<<< HEAD res_distributed_priority = {} if is_multi_gpus_available_for_distributed(): res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) else: print("Warning: Cannot run distributed unit tests. Number of visible GPUs should be >1 to run distributed unit tests.") +======= + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) res_all_tests_dict["distributed"] = res_distributed_priority if "inductor" in workflow_list: print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") @@ -466,11 +501,15 @@ def run_test_and_summarize_results( distributed_workflow_list = [] for item in distributed_list: distributed_workflow_list.append(item) +<<<<<<< HEAD res_distributed_selected = {} if is_multi_gpus_available_for_distributed(): res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) else: print("Warning: Cannot run distributed unit tests. 
Number of visible GPUs should be >1 to run distributed unit tests.") +======= + res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) res_all_tests_dict["distributed"] = res_distributed_selected if inductor_list: inductor_workflow_list = [] @@ -518,10 +557,17 @@ def parse_args(): "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor") return parser.parse_args() +<<<<<<< HEAD def is_multi_gpus_available_for_distributed(): p = subprocess.run("rocminfo | grep -cE 'Name:\\s+gfx'", shell=True, capture_output=True, text=True) num_gpus_visible = int(p.stdout) return num_gpus_visible > 1 +======= +def check_num_gpus_for_distributed(): + p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True) + num_gpus_visible = int(p.stdout) + assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def main(): args = parse_args() diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index ff3337e3f6d8..5371e4eab362 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -3,10 +3,15 @@ set -eux -o pipefail GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} +<<<<<<< HEAD if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then export TORCH_CUDA_ARCH_LIST="9.0" elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0" +======= +if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" @@ -27,6 +32,10 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" +<<<<<<< HEAD +======= + export USE_SYSTEM_NCCL=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index a182f4d36bd9..e0ea94032e13 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -31,6 +31,7 @@ def build_ArmComputeLibrary() -> None: "build=native", ] acl_install_dir = "/acl" +<<<<<<< HEAD acl_checkout_dir = "ComputeLibrary" os.makedirs(acl_install_dir) check_call( @@ -52,6 +53,30 @@ def build_ArmComputeLibrary() -> None: cwd=acl_checkout_dir, ) for d in ["arm_compute", "include", "utils", "support", "src"]: +======= + acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary") + if os.path.isdir(acl_install_dir): + shutil.rmtree(acl_install_dir) + if not os.path.isdir(acl_checkout_dir) or not 
len(os.listdir(acl_checkout_dir)): + check_call( + [ + "git", + "clone", + "https://github.com/ARM-software/ComputeLibrary.git", + "-b", + "v25.02", + "--depth", + "1", + "--shallow-submodules", + ] + ) + + check_call( + ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags, + cwd=acl_checkout_dir, + ) + for d in ["arm_compute", "include", "utils", "support", "src", "build"]: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}") @@ -87,7 +112,11 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: "/usr/local/cuda/lib64/libcusparseLt.so.0", "/usr/local/cuda/lib64/libcusolver.so.11", "/usr/local/cuda/lib64/libcurand.so.10", +<<<<<<< HEAD "/usr/local/cuda/lib64/libnvToolsExt.so.1", +======= + "/usr/local/cuda/lib64/libnccl.so.2", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "/usr/local/cuda/lib64/libnvJitLink.so.12", "/usr/local/cuda/lib64/libnvrtc.so.12", "/usr/local/cuda/lib64/libcudnn_adv.so.9", @@ -107,9 +136,15 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: "/usr/local/lib/libnvpl_blas_core.so.0", ] +<<<<<<< HEAD if "128" in desired_cuda: libs_to_copy += [ "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8", +======= + if "129" in desired_cuda: + libs_to_copy += [ + "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "/usr/local/cuda/lib64/libcufile.so.0", "/usr/local/cuda/lib64/libcufile_rdma.so.1", ] @@ -203,8 +238,15 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") +<<<<<<< HEAD build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " os.system("cd /pytorch; python setup.py clean") +======= + build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) + if enable_cuda: + build_vars = "MAX_JOBS=5 " + build_vars +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") desired_cuda = os.getenv("DESIRED_CUDA") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index c6593a179cfa..02b5c4a5fcad 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -19,13 +19,19 @@ # AMI images for us-east-1, change the following based on your ~/.aws/config os_amis = { +<<<<<<< HEAD "ubuntu18_04": "ami-078eece1d8119409f", # login_name: ubuntu +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu "ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu "redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user } +<<<<<<< HEAD ubuntu18_04_ami = os_amis["ubuntu18_04"] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ubuntu20_04_ami = os_amis["ubuntu20_04"] @@ -659,6 +665,7 @@ def configure_system( "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip" ) host.run_cmd("pip3 install dataclasses typing-extensions") +<<<<<<< 
HEAD # Install and switch to gcc-8 on Ubuntu-18.04 if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8": host.run_cmd("sudo apt-get install -y g++-8 gfortran-8") @@ -671,6 +678,8 @@ def configure_system( host.run_cmd( "sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100" ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not use_conda: print("Installing Cython + numpy from PyPy") host.run_cmd("sudo pip3 install Cython") @@ -1026,7 +1035,11 @@ def parse_arguments(): install_condaforge_python(host, args.python_version) sys.exit(0) +<<<<<<< HEAD python_version = args.python_version if args.python_version is not None else "3.8" +======= + python_version = args.python_version if args.python_version is not None else "3.9" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) diff --git a/.ci/caffe2/README.md b/.ci/caffe2/README.md index c22cd8f228a3..4be1122ab164 100644 --- a/.ci/caffe2/README.md +++ b/.ci/caffe2/README.md @@ -10,5 +10,8 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are built on Jenkins and are used in triggered builds already have this environment variable set in their manifest. Also see `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. +<<<<<<< HEAD Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/caffe2/test.sh b/.ci/caffe2/test.sh index a8adfc1fa0c7..1d6fd150e7be 100755 --- a/.ci/caffe2/test.sh +++ b/.ci/caffe2/test.sh @@ -13,10 +13,13 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then echo 'Skipping tests' exit 0 fi +<<<<<<< HEAD if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then # temporary to locate some kernel issues on the CI nodes export HSAKMT_DEBUG_LEVEL=4 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # These additional packages are needed for circleci ROCm builds. if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by diff --git a/.ci/docker/README.md b/.ci/docker/README.md index 68df30763151..d9b2d8e1b4d5 100644 --- a/.ci/docker/README.md +++ b/.ci/docker/README.md @@ -34,5 +34,9 @@ See `build.sh` for valid build environments (it's the giant switch). 
./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest # Set flags (see build.sh) and build image +<<<<<<< HEAD sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest +======= +sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index 5f17a6332dd1..86b50d5f1a1a 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -1,6 +1,13 @@ +<<<<<<< HEAD ARG CUDA_VERSION=12.4 ARG BASE_TARGET=cuda${CUDA_VERSION} FROM amd64/almalinux:8 as base +======= +ARG CUDA_VERSION=12.6 +ARG BASE_TARGET=cuda${CUDA_VERSION} +ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete +FROM amd64/almalinux:8.10-20250519 as base +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 @@ -8,12 +15,19 @@ ENV LANGUAGE en_US.UTF-8 ARG DEVTOOLSET_VERSION=11 +<<<<<<< HEAD ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 RUN yum -y update RUN yum -y install epel-release +======= +RUN yum -y update +RUN yum -y install epel-release +# install glibc-langpack-en make sure en_US.UTF-8 locale is available +RUN yum -y install glibc-langpack-en +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain # Just add everything as a safe.directory for git since these will be used in multiple places with git RUN git config --global --add safe.directory '*' @@ -41,15 +55,25 @@ RUN bash ./install_conda.sh && rm install_conda.sh # Install CUDA FROM base as cuda +<<<<<<< HEAD ARG CUDA_VERSION=12.4 RUN rm -rf /usr/local/cuda-* ADD ./common/install_cuda.sh install_cuda.sh +======= +ARG CUDA_VERSION=12.6 +RUN rm -rf /usr/local/cuda-* +ADD ./common/install_cuda.sh install_cuda.sh +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} # Preserve CUDA_VERSION for the builds ENV CUDA_VERSION=${CUDA_VERSION} # Make things in our path by default ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH +<<<<<<< HEAD FROM cuda as cuda11.8 RUN bash ./install_cuda.sh 11.8 ENV DESIRED_CUDA=11.8 @@ -62,10 +86,29 @@ FROM cuda as cuda12.4 RUN bash ./install_cuda.sh 12.4 ENV DESIRED_CUDA=12.4 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM cuda as cuda12.6 RUN bash ./install_cuda.sh 12.6 ENV DESIRED_CUDA=12.6 +<<<<<<< HEAD +======= +FROM cuda as cuda12.8 +RUN bash ./install_cuda.sh 12.8 +ENV DESIRED_CUDA=12.8 + +FROM cuda as cuda12.9 +RUN bash ./install_cuda.sh 12.9 +ENV DESIRED_CUDA=12.9 + +FROM ${ROCM_IMAGE} as rocm +ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +ADD ./common/install_mkl.sh install_mkl.sh +RUN bash ./install_mkl.sh && rm install_mkl.sh +ENV MKLROOT 
/opt/intel + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install MNIST test data FROM base as mnist ADD ./common/install_mnist.sh install_mnist.sh @@ -73,9 +116,15 @@ RUN bash ./install_mnist.sh FROM base as all_cuda COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8 +<<<<<<< HEAD COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1 COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4 COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 +======= +COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6 +COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8 +COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Final step FROM ${BASE_TARGET} as final diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index cf81bdf4aea0..7c4f639b5b30 100755 --- a/.ci/docker/almalinux/build.sh +++ b/.ci/docker/almalinux/build.sh @@ -1,12 +1,17 @@ #!/usr/bin/env bash # Script used only in CD pipeline +<<<<<<< HEAD set -eou pipefail +======= +set -exou pipefail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) image="$1" shift if [ -z "${image}" ]; then +<<<<<<< HEAD echo "Usage: $0 IMAGE" exit 1 fi @@ -79,4 +84,66 @@ if [[ "${WITH_PUSH:-}" == true ]]; then docker push "${DOCKER_IMAGE_SHA_TAG}" fi ) +======= + echo "Usage: $0 IMAGENAME:ARCHTAG" + exit 1 +fi + +# Go from imagename:tag to tag +DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') + +CUDA_VERSION="" +ROCM_VERSION="" +EXTRA_BUILD_ARGS="" +if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then + # extract cuda version from image name and tag. e.g. manylinux2_28-builder:cuda12.8 returns 12.8 + CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') + EXTRA_BUILD_ARGS="--build-arg CUDA_VERSION=${CUDA_VERSION}" +elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then + # extract rocm version from image name and tag. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4 + ROCM_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') + EXTRA_BUILD_ARGS="--build-arg ROCM_IMAGE=rocm/dev-almalinux-8:${ROCM_VERSION}-complete" +fi + +case ${DOCKER_TAG_PREFIX} in + cpu) + BASE_TARGET=base + ;; + cuda*) + BASE_TARGET=cuda${CUDA_VERSION} + ;; + rocm*) + BASE_TARGET=rocm + ;; + *) + echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" + exit 1 + ;; +esac + +# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 +# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. 
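Note: the rewritten `.ci/docker/almalinux/build.sh` above derives the CUDA or ROCm version by splitting the image tag prefix with `awk -F'cuda'` / `awk -F'rocm'`. A minimal Python sketch of that parsing, for illustration only (the function name is hypothetical and not part of the script):

    import re

    def parse_docker_tag_prefix(tag_prefix: str) -> dict:
        """Illustrative equivalent of the awk splits above, e.g. 'cuda12.8' -> {'cuda': '12.8'}."""
        for kind in ("cuda", "rocm"):
            m = re.fullmatch(kind + r"(.+)", tag_prefix)
            if m:
                return {kind: m.group(1)}
        return {}  # "cpu" and other prefixes carry no version

    print(parse_docker_tag_prefix("rocm6.2.4"))  # {'rocm': '6.2.4'}
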
+sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service +sudo systemctl daemon-reload +sudo systemctl restart docker + +export DOCKER_BUILDKIT=1 +TOPDIR=$(git rev-parse --show-toplevel) +tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') + +docker build \ + --target final \ + --progress plain \ + --build-arg "BASE_TARGET=${BASE_TARGET}" \ + --build-arg "DEVTOOLSET_VERSION=11" \ + ${EXTRA_BUILD_ARGS} \ + -t ${tmp_tag} \ + $@ \ + -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \ + ${TOPDIR}/.ci/docker/ + +if [ -n "${CUDA_VERSION}" ]; then + # Test that we're using the right CUDA compiler + docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index a7594b0f82b3..ac52ab2567b4 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -50,32 +50,46 @@ if [[ "$image" == *xla* ]]; then exit 0 fi +<<<<<<< HEAD if [[ "$image" == *-focal* ]]; then UBUNTU_VERSION=20.04 elif [[ "$image" == *-jammy* ]]; then +======= +if [[ "$image" == *-jammy* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) UBUNTU_VERSION=22.04 elif [[ "$image" == *-noble* ]]; then UBUNTU_VERSION=24.04 elif [[ "$image" == *ubuntu* ]]; then extract_version_from_image_name ubuntu UBUNTU_VERSION +<<<<<<< HEAD elif [[ "$image" == *centos* ]]; then extract_version_from_image_name centos CENTOS_VERSION +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [ -n "${UBUNTU_VERSION}" ]; then OS="ubuntu" +<<<<<<< HEAD elif [ -n "${CENTOS_VERSION}" ]; then OS="centos" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Unable to derive operating system base..." exit 1 fi DOCKERFILE="${OS}/Dockerfile" +<<<<<<< HEAD # When using ubuntu - 22.04, start from Ubuntu docker image, instead of nvidia/cuda docker image. 
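Note: the if/elif chain that follows picks which Dockerfile to use from substrings of the image name. A rough Python rendering of that dispatch, covering only the branches visible in this hunk (illustrative, not part of `build.sh`):

    def pick_dockerfile(image: str, os_name: str, ubuntu_version: str) -> str:
        # HEAD side: CUDA images on non-22.04 Ubuntu start from the nvidia/cuda base image.
        if "cuda" in image and ubuntu_version != "22.04":
            return f"{os_name}-cuda/Dockerfile"
        if "rocm" in image:
            return f"{os_name}-rocm/Dockerfile"
        if "xpu" in image:
            return f"{os_name}-xpu/Dockerfile"
        if "linter" in image:
            return "linter/Dockerfile"
        return f"{os_name}/Dockerfile"

    print(pick_dockerfile("pytorch-linux-jammy-rocm-n-py3", "ubuntu", "22.04"))  # ubuntu-rocm/Dockerfile
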
if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then DOCKERFILE="${OS}-cuda/Dockerfile" elif [[ "$image" == *rocm* ]]; then +======= +if [[ "$image" == *rocm* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DOCKERFILE="${OS}-rocm/Dockerfile" elif [[ "$image" == *xpu* ]]; then DOCKERFILE="${OS}-xpu/Dockerfile" @@ -87,9 +101,12 @@ elif [[ "$image" == *linter* ]]; then DOCKERFILE="linter/Dockerfile" fi +<<<<<<< HEAD # CMake 3.18 is needed to support CUDA17 language variant CMAKE_VERSION=3.18.5 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b if [[ "$image" == *rocm* ]]; then @@ -97,6 +114,7 @@ if [[ "$image" == *rocm* ]]; then _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d fi +<<<<<<< HEAD # It's annoying to rename jobs every time you want to rewrite a # configuration, so we hardcode everything here rather than do it # from scratch @@ -108,10 +126,24 @@ case "$image" in GCC_VERSION=11 PROTOBUF=yes DB=yes +======= +tag=$(echo $image | awk -F':' '{print $2}') + +# It's annoying to rename jobs every time you want to rewrite a +# configuration, so we hardcode everything here rather than do it +# from scratch +case "$tag" in + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes ;; @@ -122,10 +154,20 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -137,10 +179,21 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -152,25 +205,46 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.13 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes ;; pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9) +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + 
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION=12.6.3 CUDNN_VERSION=9 ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 +<<<<<<< HEAD PROTOBUF=yes DB=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes ;; @@ -181,10 +255,20 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -196,10 +280,21 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -211,10 +306,21 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) + CUDA_VERSION=12.6 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.13 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes INDUCTOR_BENCHMARKS=yes @@ -226,10 +332,21 @@ case "$image" in GCC_VERSION=9 PROTOBUF=yes DB=yes +======= + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} +<<<<<<< HEAD CONDA_CMAKE=yes TRITON=yes ;; @@ -297,12 +414,47 @@ case "$image" in ROCM_VERSION=6.3 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes +======= + TRITON=yes + ;; + pytorch-linux-jammy-py3-clang12-onnx) + ANACONDA_PYTHON_VERSION=3.9 + CLANG_VERSION=12 + VISION=yes + ONNX=yes + ;; + pytorch-linux-jammy-py3.9-clang12) + ANACONDA_PYTHON_VERSION=3.9 + CLANG_VERSION=12 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-py3.11-clang12) + ANACONDA_PYTHON_VERSION=3.11 + CLANG_VERSION=12 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-py3.9-gcc9) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=9 + VISION=yes + TRITON=yes + ;; + pytorch-linux-jammy-rocm-n-1-py3) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + VISION=yes + ROCM_VERSION=6.3 + NINJA_VERSION=1.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} 
UCC_COMMIT=${_UCC_COMMIT} INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-xpu-2024.0-py3) ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 @@ -313,30 +465,63 @@ case "$image" in NINJA_VERSION=1.9.0 CONDA_CMAKE=yes TRITON=yes +======= + pytorch-linux-jammy-rocm-n-py3) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + VISION=yes + ROCM_VERSION=6.4 + NINJA_VERSION=1.9.0 + TRITON=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + INDUCTOR_BENCHMARKS=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; pytorch-linux-jammy-xpu-2025.0-py3) ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes XPU_VERSION=2025.0 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes +======= + VISION=yes + XPU_VERSION=2025.0 + NINJA_VERSION=1.9.0 + TRITON=yes + ;; + pytorch-linux-jammy-xpu-2025.1-py3) + ANACONDA_PYTHON_VERSION=3.9 + GCC_VERSION=11 + VISION=yes + XPU_VERSION=2025.1 + NINJA_VERSION=1.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes ;; pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes KATEX=yes CONDA_CMAKE=yes +======= + VISION=yes + KATEX=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes DOCS=yes INDUCTOR_BENCHMARKS=yes ;; +<<<<<<< HEAD pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12) ANACONDA_PYTHON_VERSION=3.9 CUDA_VERSION=11.8 @@ -344,38 +529,60 @@ case "$image" in CLANG_VERSION=12 PROTOBUF=yes DB=yes +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) + ANACONDA_PYTHON_VERSION=3.9 + CUDA_VERSION=12.8.1 + CUDNN_VERSION=9 + CLANG_VERSION=12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes TRITON=yes ;; pytorch-linux-jammy-py3-clang12-asan) ANACONDA_PYTHON_VERSION=3.9 CLANG_VERSION=12 +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes CONDA_CMAKE=yes +======= + VISION=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes ;; pytorch-linux-jammy-py3-clang15-asan) ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=15 +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes ;; pytorch-linux-jammy-py3-clang18-asan) ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=18 +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes ;; pytorch-linux-jammy-py3.9-gcc11) ANACONDA_PYTHON_VERSION=3.9 GCC_VERSION=11 +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes KATEX=yes CONDA_CMAKE=yes +======= + VISION=yes + KATEX=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON=yes DOCS=yes UNINSTALL_DILL=yes @@ -383,14 +590,20 @@ case "$image" in pytorch-linux-jammy-py3-clang12-executorch) ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=12 +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXECUTORCH=yes ;; pytorch-linux-jammy-py3.12-halide) 
CUDA_VERSION=12.6 ANACONDA_PYTHON_VERSION=3.12 GCC_VERSION=11 +<<<<<<< HEAD CONDA_CMAKE=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) HALIDE=yes TRITON=yes ;; @@ -398,6 +611,7 @@ case "$image" in CUDA_VERSION=12.6 ANACONDA_PYTHON_VERSION=3.12 GCC_VERSION=11 +<<<<<<< HEAD CONDA_CMAKE=yes TRITON_CPU=yes ;; @@ -412,15 +626,34 @@ case "$image" in ANACONDA_PYTHON_VERSION=3.9 CUDA_VERSION=11.8 CONDA_CMAKE=yes +======= + TRITON_CPU=yes + ;; + pytorch-linux-jammy-linter) + # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. + # We will need to update mypy version eventually, but that's for another day. The task + # would be to upgrade mypy to 1.0.0 with Python 3.11 + PYTHON_VERSION=3.9 + ;; + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) + PYTHON_VERSION=3.9 + CUDA_VERSION=12.8.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 ACL=yes +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes CONDA_CMAKE=yes +======= + VISION=yes + CONDA_CMAKE=yes + OPENBLAS=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes @@ -429,10 +662,16 @@ case "$image" in ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 ACL=yes +<<<<<<< HEAD PROTOBUF=yes DB=yes VISION=yes CONDA_CMAKE=yes +======= + VISION=yes + CONDA_CMAKE=yes + OPENBLAS=yes +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # snadampal: skipping llvm src build install because the current version # from pytorch/llvm:9.0.1 is x86 specific SKIP_LLVM_SRC_BUILD_INSTALL=yes @@ -440,8 +679,11 @@ case "$image" in ;; *) # Catch-all for builds that are not hardcoded. 
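Note: the catch-all branch below falls back to parsing versions straight out of the image name (e.g. `rocm6.3`, `py3.10`) via `extract_version_from_image_name`. A hypothetical Python sketch of that kind of extraction; the real helper is a shell function defined elsewhere in `build.sh` and is not shown in this hunk:

    import re

    def extract_version_from_image_name(image: str, keyword: str):
        """Illustrative only: return the version string that follows `keyword` in the image name."""
        m = re.search(re.escape(keyword) + r"([0-9][0-9.]*)", image)
        return m.group(1) if m else None

    image = "pytorch-linux-jammy-rocm6.3-py3.10"
    print(extract_version_from_image_name(image, "rocm"))  # 6.3
    print(extract_version_from_image_name(image, "py"))    # 3.10
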
+<<<<<<< HEAD PROTOBUF=yes DB=yes +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VISION=yes echo "image '$image' did not match an existing build configuration" if [[ "$image" == *py* ]]; then @@ -457,8 +699,12 @@ case "$image" in TRITON=yes # To ensure that any ROCm config will build using conda cmake # and thus have LAPACK/MKL enabled +<<<<<<< HEAD CONDA_CMAKE=yes fi +======= + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$image" == *centos7* ]]; then NINJA_VERSION=1.10.2 fi @@ -474,14 +720,18 @@ case "$image" in if [[ "$image" == *glibc* ]]; then extract_version_from_image_name glibc GLIBC_VERSION fi +<<<<<<< HEAD if [[ "$image" == *cmake* ]]; then extract_version_from_image_name cmake CMAKE_VERSION fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; esac tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') +<<<<<<< HEAD #when using cudnn version 8 install it separately from cuda if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}" @@ -493,10 +743,19 @@ fi DOCKER_PROGRESS="--progress=plain" if [[ "${DOCKER_BUILDKIT}" == 0 ]]; then DOCKER_PROGRESS="" +======= +no_cache_flag="" +progress_flag="" +# Do not use cache and progress=plain when in CI +if [[ -n "${CI:-}" ]]; then + no_cache_flag="--no-cache" + progress_flag="--progress=plain" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # Build image docker build \ +<<<<<<< HEAD --no-cache \ ${DOCKER_PROGRESS} \ --build-arg "BUILD_ENVIRONMENT=${image}" \ @@ -506,18 +765,33 @@ docker build \ --build-arg "VISION=${VISION:-}" \ --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \ +======= + ${no_cache_flag} \ + ${progress_flag} \ + --build-arg "BUILD_ENVIRONMENT=${image}" \ + --build-arg "LLVMDEV=${LLVMDEV:-}" \ + --build-arg "VISION=${VISION:-}" \ + --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" \ --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \ +<<<<<<< HEAD +======= + --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "GCC_VERSION=${GCC_VERSION}" \ --build-arg "CUDA_VERSION=${CUDA_VERSION}" \ --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \ --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \ --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \ +<<<<<<< HEAD --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \ --build-arg "SWIFTSHADER=${SWIFTSHADER}" \ --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ @@ -525,7 +799,10 @@ docker build \ --build-arg 
"IMAGE_NAME=${IMAGE_NAME}" \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ +<<<<<<< HEAD --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "TRITON=${TRITON}" \ --build-arg "TRITON_CPU=${TRITON_CPU}" \ --build-arg "ONNX=${ONNX}" \ @@ -534,7 +811,13 @@ docker build \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ --build-arg "HALIDE=${HALIDE}" \ --build-arg "XPU_VERSION=${XPU_VERSION}" \ +<<<<<<< HEAD + --build-arg "ACL=${ACL:-}" \ +======= + --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \ --build-arg "ACL=${ACL:-}" \ + --build-arg "OPENBLAS=${OPENBLAS:-}" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ @@ -551,7 +834,11 @@ docker build \ UBUNTU_VERSION=$(echo ${UBUNTU_VERSION} | sed 's/-rc$//') function drun() { +<<<<<<< HEAD docker run --rm "$tmp_tag" $* +======= + docker run --rm "$tmp_tag" "$@" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if [[ "$OS" == "ubuntu" ]]; then @@ -599,3 +886,26 @@ if [ -n "$KATEX" ]; then exit 1 fi fi +<<<<<<< HEAD +======= + +HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || echo "no") +if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then + if [ "$HAS_TRITON" = "no" ]; then + echo "expecting triton to be installed, but it is not" + exit 1 + fi +elif [ "$HAS_TRITON" = "yes" ]; then + echo "expecting triton to not be installed, but it is" + exit 1 +fi + +# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if +# they support 4.0.0 yet, so exclude them from this check. 
+CMAKE_VERSION=$(drun cmake --version) +if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then + echo "CMake version is not 4.0.0:" + drun cmake --version + exit 1 +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile index e683e587d1eb..a9e567dcd765 100644 --- a/.ci/docker/centos-rocm/Dockerfile +++ b/.ci/docker/centos-rocm/Dockerfile @@ -1,7 +1,13 @@ ARG CENTOS_VERSION +<<<<<<< HEAD FROM quay.io/centos/centos:stream${CENTOS_VERSION} +======= +FROM centos:${CENTOS_VERSION} + +ARG CENTOS_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Set AMD gpu targets to build for ARG PYTORCH_ROCM_ARCH @@ -13,6 +19,7 @@ ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} COPY ./common/install_base.sh install_base.sh RUN bash ./install_base.sh && rm install_base.sh +<<<<<<< HEAD #Install langpack RUN yum install -y glibc-langpack-en @@ -28,6 +35,21 @@ ENV BASH_ENV "/etc/profile" # Install ninja RUN dnf --enablerepo=crb install -y ninja-build +======= +# Update CentOS git version +RUN yum -y remove git +RUN yum -y remove git-* +RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \ + sed -i 's/packages.endpoint/packages.endpointdev/' /etc/yum.repos.d/endpoint.repo +RUN yum install -y git + +# Install devtoolset +ARG DEVTOOLSET_VERSION +COPY ./common/install_devtoolset.sh install_devtoolset.sh +RUN bash ./install_devtoolset.sh && rm install_devtoolset.sh +ENV BASH_ENV "/etc/profile" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default glibc version ARG GLIBC_VERSION COPY ./common/install_glibc.sh install_glibc.sh @@ -40,7 +62,10 @@ RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) ARG ANACONDA_PYTHON_VERSION +<<<<<<< HEAD ARG CONDA_CMAKE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG BUILD_ENVIRONMENT ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH @@ -49,6 +74,7 @@ COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt +<<<<<<< HEAD # (optional) Install protobuf for ONNX ARG PROTOBUF COPY ./common/install_protobuf.sh install_protobuf.sh @@ -57,6 +83,9 @@ RUN rm install_protobuf.sh ENV INSTALLED_PROTOBUF ${PROTOBUF} # (optional) Install vision packages like OpenCV and ffmpeg +======= +# (optional) Install vision packages like OpenCV +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi @@ -69,12 +98,19 @@ COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh RUN rm install_rocm.sh COPY ./common/install_rocm_magma.sh install_rocm_magma.sh +<<<<<<< HEAD RUN bash ./install_rocm_magma.sh +======= +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm install_rocm_magma.sh COPY ./common/install_amdsmi.sh install_amdsmi.sh RUN bash ./install_amdsmi.sh RUN rm install_amdsmi.sh +<<<<<<< HEAD ENV ROCM_PATH /opt/rocm +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV PATH /opt/rocm/bin:$PATH ENV PATH /opt/rocm/hcc/bin:$PATH ENV PATH /opt/rocm/hip/bin:$PATH @@ -84,12 +120,15 @@ ENV MAGMA_HOME /opt/rocm/magma ENV LANG en_US.utf8 ENV LC_ALL en_US.utf8 +<<<<<<< HEAD # (optional) Install non-default CMake version ARG CMAKE_VERSION COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 6e9cfe33fe63..8c6dc1ab2b7e 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1,5 @@ +<<<<<<< HEAD ebe8522378c3f9944aaaef44868f5ececdd845fc +======= +56392aa978594cc155fa8af48cd949f5b5f1823a +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/nccl-cu12.txt b/.ci/docker/ci_commit_pins/nccl-cu12.txt index 4ddb4745d2c4..fa2c32a000de 100644 --- a/.ci/docker/ci_commit_pins/nccl-cu12.txt +++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt @@ -1 +1,5 @@ +<<<<<<< HEAD v2.26.2-1 +======= +v2.27.3-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt index 7669ab74ea7c..258f62c5c597 100644 --- a/.ci/docker/ci_commit_pins/triton-xpu.txt +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 0bcc8265e677e5321606a3311bf71470f14456a8 +======= +ae324eeac8e102a2b40370e341460f3791353398 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 24d633a34ead..a4aba649eb6a 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 9c7bc0a3d41407bff948b40cd0e9c793147e49bc +======= +21876a4bbaf371bcb83df8e6ee4f43a92f524dfe +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/cache_vision_models.sh b/.ci/docker/common/cache_vision_models.sh index 8380c48177de..760cbb85cd2a 100644 --- a/.ci/docker/common/cache_vision_models.sh +++ b/.ci/docker/common/cache_vision_models.sh @@ -2,6 +2,7 @@ set -ex +<<<<<<< HEAD # Skip pytorch-nightly installation in docker images # Installation of pytorch-nightly is needed to prefetch mobilenet_v2 avd v3 models for some tests. 
# Came from https://github.com/ROCm/pytorch/commit/85bd6bc0105162293fa0bbfb7b661f85ec67f85a @@ -16,6 +17,8 @@ set -ex echo "Skip torch-nightly installation" exit 0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" # Cache the test models at ~/.cache/torch/hub/ diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index 0e15bc931746..968033fcc843 100755 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -33,6 +33,7 @@ install_ubuntu() { maybe_libomp_dev="" fi +<<<<<<< HEAD # HACK: UCC testing relies on libnccl library from NVIDIA repo, and version 2.16 crashes # See https://github.com/pytorch/pytorch/pull/105260#issuecomment-1673399729 # TODO: Eliminate this hack, we should not relay on apt-get installation @@ -45,6 +46,8 @@ install_ubuntu() { maybe_libnccl_dev="" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install common dependencies apt-get update # TODO: Some of these may not be necessary @@ -73,7 +76,10 @@ install_ubuntu() { libasound2-dev \ libsndfile-dev \ ${maybe_libomp_dev} \ +<<<<<<< HEAD ${maybe_libnccl_dev} \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) software-properties-common \ wget \ sudo \ @@ -90,12 +96,15 @@ install_ubuntu() { # see: https://github.com/pytorch/pytorch/issues/65931 apt-get install -y libgnutls30 +<<<<<<< HEAD # Required to install the fortran after gcc update if [[ "$UBUNTU_VERSION" == "22.04"* ]]; then apt autoremove -y gfortran apt-get update -y apt-get install -y gfortran fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup package manager apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -104,6 +113,7 @@ install_ubuntu() { install_centos() { # Need EPEL for many packages we depend on. # See http://fedoraproject.org/wiki/EPEL +<<<<<<< HEAD # extras repo is not there for CentOS 9 and epel-release is already part of repo list if [[ $OS_VERSION == 9 ]]; then yum install -y epel-release @@ -119,6 +129,13 @@ install_centos() { # for Caffe2. That said, we still install them to make sure the build # system opts to build/use protoc and libprotobuf from third-party. 
yum install -y $ALLOW_ERASE \ +======= + yum --enablerepo=extras install -y epel-release + + ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt" + numpy_deps="gcc-gfortran" + yum install -y \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) $ccache_deps \ $numpy_deps \ autoconf \ @@ -135,13 +152,20 @@ install_centos() { glibc-headers \ glog-devel \ libstdc++-devel \ +<<<<<<< HEAD make \ +======= + libsndfile-devel \ + make \ + opencv-devel \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sudo \ wget \ vim \ unzip \ gdb +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]] then dnf --enablerepo=crb -y install libsndfile-devel @@ -152,6 +176,8 @@ install_centos() { libsndfile-devel fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all rm -rf /var/cache/yum @@ -159,10 +185,15 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') # Install base packages depending on the base OS +======= +# Install base packages depending on the base OS +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case "$ID" in ubuntu) install_ubuntu diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh index 0e181bd7f43d..75b684268ff1 100644 --- a/.ci/docker/common/install_cache.sh +++ b/.ci/docker/common/install_cache.sh @@ -9,7 +9,11 @@ install_ubuntu() { # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh`` apt-get install -y cargo echo "Checking out sccache repo" +<<<<<<< HEAD git clone https://github.com/mozilla/sccache -b v0.9.1 +======= + git clone https://github.com/mozilla/sccache -b v0.10.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd sccache echo "Building sccache" cargo build --release @@ -36,12 +40,16 @@ sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment export PATH="/opt/cache/bin:$PATH" # Setup compiler cache +<<<<<<< HEAD if [ -n "$ROCM_VERSION" ]; then curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache else install_ubuntu fi +======= +install_ubuntu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) chmod a+x /opt/cache/bin/sccache function write_sccache_stub() { diff --git a/.ci/docker/common/install_clang.sh b/.ci/docker/common/install_clang.sh index f7ef2fb374e4..baca548c0d99 100755 --- a/.ci/docker/common/install_clang.sh +++ b/.ci/docker/common/install_clang.sh @@ -4,6 +4,7 @@ set -ex if [ -n "$CLANG_VERSION" ]; then +<<<<<<< HEAD if [[ $CLANG_VERSION == 9 && $UBUNTU_VERSION == 18.04 ]]; then sudo apt-get update # gpg-agent is not available by default on 18.04 @@ -14,6 +15,12 @@ if [ -n "$CLANG_VERSION" ]; then # work around ubuntu apt-get conflicts sudo apt-get -y -f install wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - +======= + if [[ $UBUNTU_VERSION == 22.04 ]]; then + # work around ubuntu apt-get conflicts + sudo apt-get -y -f install + wget 
--no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $CLANG_VERSION == 18 ]]; then apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" fi @@ -41,7 +48,11 @@ if [ -n "$CLANG_VERSION" ]; then # clang's packaging is a little messed up (the runtime libs aren't # added into the linker path), so give it a little help clang_lib=("/usr/lib/llvm-$CLANG_VERSION/lib/clang/"*"/lib/linux") +<<<<<<< HEAD echo "$clang_lib" > /etc/ld.so.conf.d/clang.conf +======= + echo "$clang_lib" >/etc/ld.so.conf.d/clang.conf +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldconfig # Cleanup package manager diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index ae6d870836b1..d040499f679c 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -4,12 +4,17 @@ set -ex # Optionally install conda if [ -n "$ANACONDA_PYTHON_VERSION" ]; then +<<<<<<< HEAD BASE_URL="https://repo.anaconda.com/miniconda" CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]] || [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" fi +======= + BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" # @lint-ignore + CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1) MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2) @@ -21,7 +26,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then exit 1 ;; esac +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir -p /opt/conda chown jenkins:jenkins /opt/conda @@ -45,6 +53,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # Prevent conda from updating to 4.14.0, which causes docker build failures # See https://hud.pytorch.org/pytorch/pytorch/commit/754d7f05b6841e555cea5a4b2c505dd9e0baec1d +<<<<<<< HEAD # Uncomment the below when resolved to track the latest conda update, # but this is required for CentOS stream 9 builds ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') @@ -52,6 +61,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then if [[ $ID == centos && $OS_VERSION == 9 ]]; then as_jenkins conda update -y -n base conda fi +======= + # Uncomment the below when resolved to track the latest conda update + # as_jenkins conda update -y -n base conda +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $(uname -m) == "aarch64" ]]; then export SYSROOT_DEP="sysroot_linux-aarch64=2.17" @@ -85,6 +98,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # and libpython-static for torch deploy conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}" +<<<<<<< HEAD # Use conda cmake in some cases. Conda cmake will be newer than our supported # min version (3.5 for xenial and 3.10 for bionic), so we only do it in those # following builds that we know should use conda. 
Specifically, Ubuntu bionic @@ -93,11 +107,17 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install cmake=3.31.6 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Magma package names are concatenation of CUDA major and minor ignoring revision # I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89 # Magma is installed from a tarball in the ossci-linux bucket into the conda env if [ -n "$CUDA_VERSION" ]; then +<<<<<<< HEAD ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION} +======= + conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index c6a9b27721b8..32e5f37c9ea8 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -3,11 +3,19 @@ set -uex -o pipefail PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python +<<<<<<< HEAD PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py # Python versions to be installed in /opt/$VERSION_NO CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} +======= +PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads # @lint-ignore +GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py + +# Python versions to be installed in /opt/$VERSION_NO +CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function check_var { if [ -z "$1" ]; then diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index 943e8826e1ee..aa3ce39aea4a 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -2,6 +2,7 @@ set -ex +<<<<<<< HEAD NCCL_VERSION=v2.26.2-1 CUDNN_VERSION=9.5.1.17 @@ -136,10 +137,61 @@ function install_126 { rm -rf nccl install_cusparselt_063 +======= +arch_path='' +targetarch=${TARGETARCH:-$(uname -m)} +if [ ${targetarch} = 'amd64' ] || [ "${targetarch}" = 'x86_64' ]; then + arch_path='x86_64' +else + arch_path='sbsa' +fi + +function install_cuda { + version=$1 + runfile=$2 + major_minor=${version%.*} + rm -rf /usr/local/cuda-${major_minor} /usr/local/cuda + if [[ ${arch_path} == 'sbsa' ]]; then + runfile="${runfile}_sbsa" + fi + runfile="${runfile}.run" + wget -q https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile} -O ${runfile} + chmod +x ${runfile} + ./${runfile} --toolkit --silent + rm -f ${runfile} + rm -f /usr/local/cuda && ln -s /usr/local/cuda-${major_minor} /usr/local/cuda +} + +function install_cudnn { + cuda_major_version=$1 + cudnn_version=$2 + mkdir tmp_cudnn && cd tmp_cudnn + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive" + wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz + tar xf ${filepath}.tar.xz + cp -a ${filepath}/include/* /usr/local/cuda/include/ + cp -a ${filepath}/lib/* 
/usr/local/cuda/lib64/ + cd .. + rm -rf tmp_cudnn +} + +function install_126 { + CUDNN_VERSION=9.10.2.21 + echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" + install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux + + install_cudnn 12 $CUDNN_VERSION + + CUDA_VERSION=12.6 bash install_nccl.sh + + CUDA_VERSION=12.6 bash install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldconfig } +<<<<<<< HEAD function prune_118 { echo "Pruning CUDA 11.8 and cuDNN" ##################################################################################### @@ -203,6 +255,22 @@ function prune_124 { ##################################################################################### export CUDA_BASE="/usr/local/cuda-12.4/" rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/ +======= +function install_129 { + CUDNN_VERSION=9.10.2.21 + echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" + # install CUDA 12.9.1 in the same container + install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + install_cudnn 12 $CUDNN_VERSION + + CUDA_VERSION=12.9 bash install_nccl.sh + + CUDA_VERSION=12.9 bash install_cusparselt.sh + + ldconfig +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } function prune_126 { @@ -240,6 +308,7 @@ function prune_126 { } function install_128 { +<<<<<<< HEAD CUDNN_VERSION=9.7.1.26 echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3" rm -rf /usr/local/cuda-12.8 /usr/local/cuda @@ -269,6 +338,19 @@ function install_128 { rm -rf nccl install_cusparselt_063 +======= + CUDNN_VERSION=9.8.0.87 + echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" + # install CUDA 12.8.1 in the same container + install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + install_cudnn 12 $CUDNN_VERSION + + CUDA_VERSION=12.8 bash install_nccl.sh + + CUDA_VERSION=12.8 bash install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldconfig } @@ -277,6 +359,7 @@ function install_128 { while test $# -gt 0 do case "$1" in +<<<<<<< HEAD 11.8) install_118; prune_118 ;; 12.4) install_124; prune_124 @@ -284,6 +367,13 @@ do 12.6) install_126; prune_126 ;; 12.8) install_128; +======= + 12.6|12.6.*) install_126; prune_126 + ;; + 12.8|12.8.*) install_128; + ;; + 12.9|12.9.*) install_129; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; *) echo "bad argument $1"; exit 1 ;; diff --git a/.ci/docker/common/install_cudnn.sh b/.ci/docker/common/install_cudnn.sh index e008cda5c7a6..90b8cd58fe69 100644 --- a/.ci/docker/common/install_cudnn.sh +++ b/.ci/docker/common/install_cudnn.sh @@ -4,12 +4,19 @@ if [[ -n "${CUDNN_VERSION}" ]]; then # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement mkdir tmp_cudnn pushd tmp_cudnn +<<<<<<< HEAD if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive" elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive" elif [[ 
${CUDA_VERSION:0:2} == "12" ]]; then CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive" +======= + if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" + elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then + CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive" else diff --git a/.ci/docker/common/install_cusparselt.sh b/.ci/docker/common/install_cusparselt.sh index 0603739fb041..19debfcb400f 100644 --- a/.ci/docker/common/install_cusparselt.sh +++ b/.ci/docker/common/install_cusparselt.sh @@ -5,12 +5,17 @@ set -ex # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html mkdir tmp_cusparselt && cd tmp_cusparselt +<<<<<<< HEAD if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-8]$ ]]; then +======= +if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) arch_path='sbsa' export TARGETARCH=${TARGETARCH:-$(uname -m)} if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then arch_path='x86_64' fi +<<<<<<< HEAD CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.3.2-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then @@ -24,6 +29,10 @@ elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive" curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz +======= + CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.7.1.0-archive" + curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Not sure which libcusparselt version to install for this ${CUDA_VERSION}" fi diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index e30e0a787bbe..b7ea7e3f2cfa 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -13,7 +13,11 @@ clone_executorch() { # and fetch the target commit pushd executorch git checkout "${EXECUTORCH_PINNED_COMMIT}" +<<<<<<< HEAD git submodule update --init +======= + git submodule update --init --recursive +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd chown -R jenkins executorch diff --git a/.ci/docker/common/install_halide.sh b/.ci/docker/common/install_halide.sh index ed1d7d33649d..d589a179a67f 100644 --- a/.ci/docker/common/install_halide.sh +++ b/.ci/docker/common/install_halide.sh @@ -17,7 +17,11 @@ if [ -n "${UBUNTU_VERSION}" ];then libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev fi +<<<<<<< HEAD conda_install numpy scipy imageio cmake ninja +======= +pip_install numpy scipy imageio cmake ninja +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git cmake -DCMAKE_BUILD_TYPE=Release \ diff --git a/.ci/docker/common/install_inductor_benchmark_deps.sh b/.ci/docker/common/install_inductor_benchmark_deps.sh index 5b775af6539f..28b145413aae 100644 --- a/.ci/docker/common/install_inductor_benchmark_deps.sh +++ b/.ci/docker/common/install_inductor_benchmark_deps.sh @@ -16,7 +16,11 @@ function install_timm() { pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" # Clean up +<<<<<<< HEAD conda_run pip uninstall -y cmake torch torchvision triton +======= + conda_run pip uninstall -y torch torchvision triton +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } # Pango is needed for weasyprint which is needed for doctr diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh index a7f008fb735d..1d694604472a 100644 --- a/.ci/docker/common/install_linter.sh +++ b/.ci/docker/common/install_linter.sh @@ -2,8 +2,11 @@ set -ex +<<<<<<< HEAD source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [ -n "${UBUNTU_VERSION}" ]; then apt update apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5 @@ -15,8 +18,13 @@ chown -R jenkins pytorch pushd pytorch # Install all linter dependencies +<<<<<<< HEAD pip_install -r requirements.txt conda_run lintrunner init +======= +pip install -r requirements.txt +lintrunner init +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cache .lintbin directory as part of the Docker image cp -r .lintbin /tmp diff --git a/.ci/docker/common/install_magma_conda.sh b/.ci/docker/common/install_magma_conda.sh index 9557d943004c..62f6303a3166 100755 --- a/.ci/docker/common/install_magma_conda.sh +++ b/.ci/docker/common/install_magma_conda.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +<<<<<<< HEAD # Script that replaces the magma install from a conda package set -eou pipefail @@ -24,3 +25,27 @@ function do_install() { } do_install $1 $2 +======= +# Script that installs magma from tarball inside conda environment. +# It replaces anaconda magma-cuda package which is no longer published. +# Execute it inside active conda environment. 
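As context for the rewritten install_magma_conda.sh whose body follows, install_conda.sh (earlier in this patch) passes it only the CUDA major.minor, trimmed with cut, and the script itself then strips the remaining dot when building the archive name. A tiny, hedged illustration of both trims, using example values only:

#!/bin/bash
# Illustration only; CUDA_VERSION and the archive name below are example values.
CUDA_VERSION="12.6.3"
majmin=$(cut -f1-2 -d'.' <<< "${CUDA_VERSION}")   # 12.6 -- what install_conda.sh passes as $1
cuda_version_nodot=${majmin/./}                   # 126  -- what the script embeds in the tarball name
echo "magma-cuda${cuda_version_nodot}-2.6.1-1.tar.bz2"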
+# See issue: https://github.com/pytorch/pytorch/issues/138506 + +set -eou pipefail + +cuda_version_nodot=${1/./} +anaconda_dir=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + +MAGMA_VERSION="2.6.1" +magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" +( + set -x + tmp_dir=$(mktemp -d) + pushd ${tmp_dir} + curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} + tar -xvf "${magma_archive}" + mv include/* "${anaconda_dir}/include/" + mv lib/* "${anaconda_dir}/lib" + popd +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_nccl.sh b/.ci/docker/common/install_nccl.sh new file mode 100644 index 000000000000..17d80ebe7d27 --- /dev/null +++ b/.ci/docker/common/install_nccl.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -ex + +NCCL_VERSION="" +if [[ ${CUDA_VERSION:0:2} == "11" ]]; then + NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt) +elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then + NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt) +else + echo "Unexpected CUDA_VERSION ${CUDA_VERSION}" + exit 1 +fi + +if [[ -n "${NCCL_VERSION}" ]]; then + # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses + # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build + git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git + pushd nccl + make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ + popd + rm -rf nccl + ldconfig +fi diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index fdd0f9acf135..51a52792945f 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -8,6 +8,7 @@ retry () { "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@") } +<<<<<<< HEAD # A bunch of custom pip dependencies for ONNX pip_install \ beartype==0.15.0 \ @@ -18,6 +19,8 @@ pip_install \ networkx==2.5 \ numpy==1.24.2 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # ONNXRuntime should be installed before installing # onnx-weekly. Otherwise, onnx-weekly could be # overwritten by onnx. @@ -29,12 +32,17 @@ pip_install \ transformers==4.36.2 pip_install coloredlogs packaging +<<<<<<< HEAD pip_install onnxruntime==1.18.1 pip_install onnx==1.17.0 pip_install onnxscript==0.2.2 --no-deps # required by onnxscript pip_install ml_dtypes +======= +pip_install onnxruntime==1.18.1 +pip_install onnxscript==0.3.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. 
By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh index 7f0b3620bdc1..e71a678a3483 100644 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -4,9 +4,15 @@ set -ex cd / +<<<<<<< HEAD git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules +======= +git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules + +OPENBLAS_CHECKOUT_DIR="OpenBLAS" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OPENBLAS_BUILD_FLAGS=" NUM_THREADS=128 USE_OPENMP=1 @@ -14,9 +20,15 @@ NO_SHARED=0 DYNAMIC_ARCH=1 TARGET=ARMV8 CFLAGS=-O3 +<<<<<<< HEAD " OPENBLAS_CHECKOUT_DIR="OpenBLAS" +======= +BUILD_BFLOAT16=1 +" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR} make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR} diff --git a/.ci/docker/common/install_python.sh b/.ci/docker/common/install_python.sh new file mode 100644 index 000000000000..be5a09b80a60 --- /dev/null +++ b/.ci/docker/common/install_python.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -ex + +apt-get update +# Use deadsnakes in case we need an older python version +sudo add-apt-repository ppa:deadsnakes/ppa +apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip python${PYTHON_VERSION}-venv + +# Use a venv because uv and some other package managers don't support --user install +ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python +python -m venv /var/lib/jenkins/ci_env +source /var/lib/jenkins/ci_env/bin/activate + +python -mpip install --upgrade pip +python -mpip install -r /opt/requirements-ci.txt diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 5f655f2010d4..39de64f7006f 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -21,6 +21,21 @@ install_ubuntu() { apt-get install -y libc++1 apt-get install -y libc++abi1 +<<<<<<< HEAD +======= + # Make sure rocm packages from repo.radeon.com have highest priority + cat << EOF > /etc/apt/preferences.d/rocm-pin-600 +Package: * +Pin: release o=repo.radeon.com +Pin-Priority: 600 +EOF + + # we want the patch version of 6.4 instead + if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then + ROCM_VERSION="${ROCM_VERSION}.1" + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Add amdgpu repository UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list @@ -61,17 +76,42 @@ install_ubuntu() { done # ROCm 6.3 had a regression where initializing static code objects had significant overhead +<<<<<<< HEAD if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then # clr build needs CppHeaderParser but can only find it using conda's python /opt/conda/bin/python -m pip install CppHeaderParser git clone https://github.com/ROCm/HIP -b rocm-6.3.x HIP_COMMON_DIR=$(readlink -f HIP) git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix +======= + # ROCm 6.4 did not yet fix the regression, also HIP 
branch names are different + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.3) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then + if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then + HIP_BRANCH=release/rocm-rel-6.4 + VER_STR=6.4 + VER_PATCH=.1 + elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then + HIP_BRANCH=release/rocm-rel-6.4 + VER_STR=6.4 + elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then + HIP_BRANCH=rocm-6.3.x + VER_STR=6.3 + fi + # clr build needs CppHeaderParser but can only find it using conda's python + /opt/conda/bin/python -m pip install CppHeaderParser + git clone https://github.com/ROCm/HIP -b $HIP_BRANCH + HIP_COMMON_DIR=$(readlink -f HIP) + git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir -p clr/build pushd clr/build cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR make -j +<<<<<<< HEAD cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.* +======= + cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd rm -rf HIP clr fi @@ -86,6 +126,7 @@ install_centos() { yum update -y yum install -y kmod yum install -y wget +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then dnf install -y openblas-serial @@ -122,6 +163,28 @@ install_centos() { else local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}/main" fi +======= + yum install -y openblas-devel + + yum install -y epel-release + yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r` + + # Add amdgpu repository + local amdgpu_baseurl + if [[ $OS_VERSION == 9 ]]; then + amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64" + else + amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64" + fi + echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo + echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo + echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo + echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo + echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo + + local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "[ROCm]" > /etc/yum.repos.d/rocm.repo echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo echo "baseurl=${rocm_baseurl}" >> /etc/yum.repos.d/rocm.repo @@ -129,6 +192,7 @@ install_centos() { echo "gpgcheck=1" >> /etc/yum.repos.d/rocm.repo echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/rocm.repo +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then yum update -y --nogpgcheck dnf --enablerepo=crb install -y perl-File-BaseDir python3-wheel @@ -136,6 +200,11 @@ install_centos() { else yum update -y yum install -y \ +======= + yum update -y + + yum install -y \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rocm-dev \ rocm-utils \ rocm-libs \ @@ -143,7 +212,10 @@ install_centos() { rocprofiler-dev \ roctracer-dev \ amd-smi-lib +<<<<<<< HEAD fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) # precompiled miopen kernels; search for all unversioned packages # if search fails it will abort this script; use true to avoid case where search fails @@ -167,8 +239,11 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Python packages depending on the base OS ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index db826ed6e027..d3677483ea7c 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -1,3 +1,4 @@ +<<<<<<< HEAD #!/bin/bash # Script used in CI and CD pipeline @@ -58,3 +59,42 @@ LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" popd mv magma /opt/rocm +======= +#!/usr/bin/env bash +# Script used only in CD pipeline + +set -eou pipefail + +function do_install() { + rocm_version=$1 + if [[ ${rocm_version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + # chop off any patch version + rocm_version="${rocm_version%.*}" + fi + + rocm_version_nodot=${rocm_version//./} + + # Version 2.7.2 + ROCm related updates + MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 + magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" + + rocm_dir="/opt/rocm" + ( + set -x + tmp_dir=$(mktemp -d) + pushd ${tmp_dir} + curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} + if tar -xvf "${magma_archive}" + then + mkdir -p "${rocm_dir}/magma" + mv include "${rocm_dir}/magma/include" + mv lib "${rocm_dir}/magma/lib" + else + echo "${magma_archive} not found, skipping magma install" + fi + popd + ) +} + +do_install $1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index b72020d822d5..f2a3363935f1 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -2,6 +2,7 @@ set -ex +<<<<<<< HEAD source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" get_conda_version() { @@ -10,6 +11,18 @@ get_conda_version() { conda_reinstall() { as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $* +======= +mkdir -p /opt/triton +if [ -z "${TRITON}" ] && [ -z "${TRITON_CPU}" ]; then + echo "TRITON and TRITON_CPU are not set. Exiting..." 
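The rewritten install_rocm_magma.sh above normalizes its ROCm version argument with two bash parameter expansions: %.* chops the patch component, and //./ deletes the remaining dot for the archive name. In isolation, with an example version:

#!/bin/bash
# The two parameter expansions install_rocm_magma.sh applies to its argument (example value).
rocm_version="6.4.1"
if [[ ${rocm_version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  rocm_version="${rocm_version%.*}"        # 6.4.1 -> 6.4 (chop the patch level)
fi
rocm_version_nodot="${rocm_version//./}"   # 6.4 -> 64 (dots removed for the archive name)
echo "magma-rocm${rocm_version_nodot}-<commit>-1.tar.bz2"   # <commit> stands in for the pinned MAGMA_VERSION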
+ exit 0 +fi + +source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" + +get_pip_version() { + conda_run pip list | grep -w $* | head -n 1 | awk '{print $2}' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if [ -n "${XPU_VERSION}" ]; then @@ -31,11 +44,17 @@ if [ -n "${UBUNTU_VERSION}" ];then apt-get install -y gpg-agent fi +<<<<<<< HEAD if [ -n "${CONDA_CMAKE}" ]; then # Keep the current cmake and numpy version here, so we can reinstall them later CMAKE_VERSION=$(get_conda_version cmake) NUMPY_VERSION=$(get_conda_version numpy) fi +======= +# Keep the current cmake and numpy version here, so we can reinstall them later +CMAKE_VERSION=$(get_pip_version cmake) +NUMPY_VERSION=$(get_pip_version numpy) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [ -z "${MAX_JOBS}" ]; then export MAX_JOBS=$(nproc) @@ -51,7 +70,17 @@ as_jenkins git clone --recursive ${TRITON_REPO} triton cd triton as_jenkins git checkout ${TRITON_PINNED_COMMIT} as_jenkins git submodule update --init --recursive +<<<<<<< HEAD cd python +======= + +# Old versions of python have setup.py in ./python; newer versions have it in ./ +if [ ! -f setup.py ]; then + cd python +fi + +pip_install pybind11==2.13.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py @@ -60,12 +89,17 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" # Triton needs at least gcc-9 to build apt-get install -y g++-9 +<<<<<<< HEAD CXX=g++-9 pip_install . +======= + CXX=g++-9 conda_run python setup.py bdist_wheel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then # Triton needs which surprisingly is not available with clang-9 toolchain add-apt-repository -y ppa:ubuntu-toolchain-r/test apt-get install -y g++-9 +<<<<<<< HEAD CXX=g++-9 pip_install . else pip_install . @@ -84,4 +118,34 @@ if [ -n "${CONDA_CMAKE}" ]; then conda_reinstall cmake="${CMAKE_VERSION}" # Note that we install numpy with pip as conda might not have the version we want pip_install --force-reinstall numpy=="${NUMPY_VERSION}" +======= + CXX=g++-9 conda_run python setup.py bdist_wheel +else + conda_run python setup.py bdist_wheel +fi + +# Copy the wheel to /opt for multi stage docker builds +cp dist/*.whl /opt/triton +# Install the wheel for docker builds that don't use multi stage +pip_install dist/*.whl + +# TODO: This is to make sure that the same cmake and numpy version from install conda +# script is used. Without this step, the newer cmake version (3.25.2) downloaded by +# triton build step via pip will fail to detect conda MKL. Once that issue is fixed, +# this can be removed. +# +# The correct numpy version also needs to be set here because conda claims that it +# causes inconsistent environment. Without this, conda will attempt to install the +# latest numpy version, which fails ASAN tests with the following import error: Numba +# needs NumPy 1.20 or less. 
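The comment block around this point explains why install_triton.sh records the cmake and numpy versions before the Triton build and force-reinstalls them afterwards. conda_run and pip_install are repo helpers defined in common_utils.sh, outside this hunk, so the following restates the same record-then-restore idea with plain pip; it is a sketch, not the script itself:

#!/bin/bash
# Standalone restatement of the cmake/numpy pin-and-restore around the Triton build.
set -ex
get_pkg_version() { pip list 2>/dev/null | grep -iw "$1" | head -n 1 | awk '{print $2}'; }
CMAKE_VERSION=$(get_pkg_version cmake)
NUMPY_VERSION=$(get_pkg_version numpy)

# ... build and install the Triton wheel here; it may drag in newer cmake/numpy ...

if [ -n "${CMAKE_VERSION}" ]; then
  pip install --force-reinstall "cmake==${CMAKE_VERSION}"
fi
if [ -n "${NUMPY_VERSION}" ]; then
  pip install --force-reinstall "numpy==${NUMPY_VERSION}"
fi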
+# Note that we install numpy with pip as conda might not have the version we want +if [ -n "${CMAKE_VERSION}" ]; then + pip_install "cmake==${CMAKE_VERSION}" +fi +if [ -n "${NUMPY_VERSION}" ]; then + pip_install "numpy==${NUMPY_VERSION}" +fi +if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then + pip_install helion +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi diff --git a/.ci/docker/common/install_vision.sh b/.ci/docker/common/install_vision.sh index 532d8d14a55c..665b8c0805c6 100755 --- a/.ci/docker/common/install_vision.sh +++ b/.ci/docker/common/install_vision.sh @@ -15,6 +15,7 @@ install_ubuntu() { install_centos() { # Need EPEL for many packages we depend on. # See http://fedoraproject.org/wiki/EPEL +<<<<<<< HEAD if [[ $OS_VERSION == 9 ]]; then yum install -y epel-release else @@ -23,6 +24,12 @@ install_centos() { opencv-devel \ ffmpeg-devel fi +======= + yum --enablerepo=extras install -y epel-release + + yum install -y \ + opencv-devel +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Cleanup yum clean all @@ -31,8 +38,11 @@ install_centos() { rm -rf /var/lib/yum/history } +<<<<<<< HEAD OS_VERSION=$(grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install base packages depending on the base OS ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 08e6f3aa6d1a..2270497f0025 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -26,7 +26,11 @@ function install_ubuntu() { wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \ +<<<<<<< HEAD https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \ +======= + https://apt.repos.intel.com/oneapi all main" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | tee /etc/apt/sources.list.d/oneAPI.list # Update the packages list and repository index @@ -47,9 +51,12 @@ function install_ubuntu() { # Development Packages apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev # Install Intel Support Packages +<<<<<<< HEAD if [[ "$XPU_VERSION" == "2025.0" ]]; then XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl=2025.0.1-6" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) apt-get install -y ${XPU_PACKAGES} # Cleanup @@ -77,7 +84,11 @@ function install_rhel() { tee > /etc/yum.repos.d/oneAPI.repo << EOF [oneAPI] name=Intel for Pytorch GPU dev repository +<<<<<<< HEAD baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME} +======= +baseurl=https://yum.repos.intel.com/oneapi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) enabled=1 gpgcheck=1 repo_gpgcheck=1 @@ -85,9 +96,12 @@ gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS. 
EOF # Install Intel Support Packages +<<<<<<< HEAD if [[ "$XPU_VERSION" == "2025.0" ]]; then XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl-2025.0.1-6" fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) yum install -y ${XPU_PACKAGES} # The xpu-smi packages dnf install -y xpu-smi @@ -124,7 +138,11 @@ function install_sles() { https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo rpm --import https://repositories.intel.com/gpu/intel-graphics.key # To add the online network network package repository for the Intel Support Packages +<<<<<<< HEAD zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI +======= + zypper addrepo https://yum.repos.intel.com/oneapi oneAPI +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB # The xpu-smi packages @@ -147,10 +165,17 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then XPU_DRIVER_VERSION="" fi +<<<<<<< HEAD XPU_REPO_NAME="intel-for-pytorch-gpu-dev" XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9" if [[ "$XPU_VERSION" == "2025.0" ]]; then XPU_REPO_NAME="oneapi" +======= +# Default use Intel® oneAPI Deep Learning Essentials 2025.0 +if [[ "$XPU_VERSION" == "2025.1" ]]; then + XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) XPU_PACKAGES="intel-deep-learning-essentials-2025.0" fi diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index b83071b25aa5..34ee4426ef15 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -49,6 +49,7 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh FROM cpu as cuda ADD ./common/install_cuda.sh install_cuda.sh ADD ./common/install_magma.sh install_magma.sh +<<<<<<< HEAD ENV CUDA_HOME /usr/local/cuda FROM cuda as cuda11.8 @@ -61,6 +62,13 @@ RUN bash ./install_cuda.sh 12.4 RUN bash ./install_magma.sh 12.4 RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda +======= +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +ENV CUDA_HOME /usr/local/cuda + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM cuda as cuda12.6 RUN bash ./install_cuda.sh 12.6 RUN bash ./install_magma.sh 12.6 @@ -71,7 +79,17 @@ RUN bash ./install_cuda.sh 12.8 RUN bash ./install_magma.sh 12.8 RUN ln -sf /usr/local/cuda-12.8 /usr/local/cuda +<<<<<<< HEAD +FROM cpu as rocm +======= +FROM cuda as cuda12.9 +RUN bash ./install_cuda.sh 12.9 +RUN bash ./install_magma.sh 12.9 +RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda + FROM cpu as rocm +ARG ROCM_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ENV MKLROOT /opt/intel @@ -86,11 +104,19 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh # gfortran and python needed for building magma from source for ROCm RUN apt-get update -y && \ apt-get install gfortran -y && \ +<<<<<<< HEAD apt-get install python -y && \ apt-get clean RUN bash ./install_rocm_drm.sh && rm 
install_rocm_drm.sh RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +======= + apt-get install python3 python-is-python3 -y && \ + apt-get clean + +RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM ${BASE_TARGET} as final COPY --from=openssl /opt/openssl /opt/openssl diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index fd9932f8def8..9931d01ce866 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -1,12 +1,17 @@ #!/usr/bin/env bash # Script used only in CD pipeline +<<<<<<< HEAD set -eou pipefail +======= +set -eoux pipefail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) image="$1" shift if [ -z "${image}" ]; then +<<<<<<< HEAD echo "Usage: $0 IMAGE" exit 1 fi @@ -44,10 +49,52 @@ case ${GPU_ARCH_TYPE} in ;; *) echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}" +======= + echo "Usage: $0 IMAGENAME:ARCHTAG" + exit 1 +fi + +TOPDIR=$(git rev-parse --show-toplevel) + +DOCKER=${DOCKER:-docker} + +# Go from imagename:tag to tag +DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') + +GPU_ARCH_VERSION="" +if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then + # extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8 + GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') +elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then + # extract rocm version from image name. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4 + GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') +fi + +case ${DOCKER_TAG_PREFIX} in + cpu) + BASE_TARGET=cpu + GPU_IMAGE=ubuntu:20.04 + DOCKER_GPU_BUILD_ARG="" + ;; + cuda*) + BASE_TARGET=cuda${GPU_ARCH_VERSION} + GPU_IMAGE=ubuntu:20.04 + DOCKER_GPU_BUILD_ARG="" + ;; + rocm*) + BASE_TARGET=rocm + GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" + ;; + *) + echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exit 1 ;; esac +<<<<<<< HEAD ( set -x @@ -81,3 +128,16 @@ if [[ "${WITH_PUSH}" == true ]]; then fi ) fi +======= +tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') + +DOCKER_BUILDKIT=1 ${DOCKER} build \ + --target final \ + ${DOCKER_GPU_BUILD_ARG} \ + --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ + --build-arg "BASE_TARGET=${BASE_TARGET}" \ + -t "${tmp_tag}" \ + $@ \ + -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \ + "${TOPDIR}/.ci/docker/" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/linter-cuda/Dockerfile b/.ci/docker/linter-cuda/Dockerfile index 8084bf627124..a8db2f257ac9 100644 --- a/.ci/docker/linter-cuda/Dockerfile +++ b/.ci/docker/linter-cuda/Dockerfile @@ -18,6 +18,7 @@ COPY ./common/install_user.sh install_user.sh RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) +<<<<<<< HEAD ARG 
ANACONDA_PYTHON_VERSION ARG CONDA_CMAKE ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION @@ -27,19 +28,43 @@ COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh COPY ./common/install_magma_conda.sh install_magma_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt +======= +ARG PYTHON_VERSION +ARG PIP_CMAKE +# Put venv into the env vars so users don't need to activate it +ENV PATH /var/lib/jenkins/ci_env/bin:$PATH +ENV VIRTUAL_ENV /var/lib/jenkins/ci_env +COPY requirements-ci.txt /opt/requirements-ci.txt +COPY ./common/install_python.sh install_python.sh +RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install cuda and cudnn ARG CUDA_VERSION COPY ./common/install_cuda.sh install_cuda.sh +<<<<<<< HEAD RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh +======= +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV DESIRED_CUDA ${CUDA_VERSION} ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH # Note that Docker build forbids copying file outside the build context COPY ./common/install_linter.sh install_linter.sh +<<<<<<< HEAD COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_linter.sh RUN rm install_linter.sh common_utils.sh +======= +RUN bash ./install_linter.sh +RUN rm install_linter.sh + +RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USER jenkins CMD ["bash"] diff --git a/.ci/docker/linter/Dockerfile b/.ci/docker/linter/Dockerfile index 968918a3617c..607e8839559f 100644 --- a/.ci/docker/linter/Dockerfile +++ b/.ci/docker/linter/Dockerfile @@ -15,6 +15,7 @@ COPY ./common/install_user.sh install_user.sh RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) +<<<<<<< HEAD ARG ANACONDA_PYTHON_VERSION ARG CONDA_CMAKE ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION @@ -29,6 +30,19 @@ COPY ./common/install_linter.sh install_linter.sh COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_linter.sh RUN rm install_linter.sh common_utils.sh +======= +ARG PYTHON_VERSION +ENV PATH /var/lib/jenkins/ci_env/bin:$PATH +ENV VIRTUAL_ENV /var/lib/jenkins/ci_env +COPY requirements-ci.txt /opt/requirements-ci.txt +COPY ./common/install_python.sh install_python.sh +RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt + +# Note that Docker build forbids copying file outside the build context +COPY ./common/install_linter.sh install_linter.sh +RUN bash ./install_linter.sh +RUN rm install_linter.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USER jenkins CMD ["bash"] diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index e63657f391b3..21448786a7b7 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ 
b/.ci/docker/manywheel/Dockerfile_2_28 @@ -7,8 +7,13 @@ ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 +<<<<<<< HEAD ARG DEVTOOLSET_VERSION=11 RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +======= +ARG DEVTOOLSET_VERSION=13 +RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH @@ -26,17 +31,31 @@ ADD ./common/install_openssl.sh install_openssl.sh RUN bash ./install_openssl.sh && rm install_openssl.sh +<<<<<<< HEAD # remove unncessary python versions +======= +# remove unnecessary python versions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 FROM base as cuda +<<<<<<< HEAD ARG BASE_CUDA_VERSION=11.8 # Install CUDA ADD ./common/install_cuda.sh install_cuda.sh RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh +======= +ARG BASE_CUDA_VERSION=12.6 +# Install CUDA +ADD ./common/install_cuda.sh install_cuda.sh +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM base as intel # MKL @@ -44,7 +63,11 @@ ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh FROM base as magma +<<<<<<< HEAD ARG BASE_CUDA_VERSION=10.2 +======= +ARG BASE_CUDA_VERSION=12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install magma ADD ./common/install_magma.sh install_magma.sh RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh @@ -61,7 +84,11 @@ ADD ./common/install_libpng.sh install_libpng.sh RUN bash ./install_libpng.sh && rm install_libpng.sh FROM ${GPU_IMAGE} as common +<<<<<<< HEAD ARG DEVTOOLSET_VERSION=11 +======= +ARG DEVTOOLSET_VERSION=13 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 @@ -84,6 +111,7 @@ RUN yum install -y \ wget \ which \ xz \ +<<<<<<< HEAD gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \ glibc-langpack-en RUN yum install -y \ @@ -91,6 +119,14 @@ RUN yum install -y \ https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm RUN yum swap -y git git236-core +======= + glibc-langpack-en \ + 
gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ + gcc-toolset-${DEVTOOLSET_VERSION}-gdb + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # git236+ would refuse to run git commands in repos owned by other users # Which causes version check to fail, as pytorch repo is bind-mounted into the image # Override this behaviour by treating every folder as safe @@ -101,6 +137,10 @@ ENV SSL_CERT_FILE=/opt/_internal/certs.pem # Install LLVM version COPY --from=openssl /opt/openssl /opt/openssl COPY --from=base /opt/python /opt/python +<<<<<<< HEAD +======= +COPY --from=base /usr/local/lib/ /usr/local/lib/ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) COPY --from=base /opt/_internal /opt/_internal COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel COPY --from=intel /opt/intel /opt/intel @@ -114,8 +154,13 @@ COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/ COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h FROM common as cpu_final +<<<<<<< HEAD ARG BASE_CUDA_VERSION=11.8 ARG DEVTOOLSET_VERSION=11 +======= +ARG BASE_CUDA_VERSION=12.6 +ARG DEVTOOLSET_VERSION=13 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Anaconda ADD ./common/install_conda_docker.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh @@ -154,11 +199,22 @@ ENV ROCM_PATH /opt/rocm # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 +<<<<<<< HEAD ADD ./common/install_rocm_drm.sh install_rocm_drm.sh RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh ENV MKLROOT /opt/intel ADD ./common/install_rocm_magma.sh install_rocm_magma.sh RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +======= +# replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path +ADD ./common/install_rocm_drm.sh install_rocm_drm.sh +RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh +# ROCm 6.4 rocm-smi depends on system drm.h header +RUN yum install -y libdrm-devel +ENV MKLROOT /opt/intel +ADD ./common/install_rocm_magma.sh install_rocm_magma.sh +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ADD ./common/install_miopen.sh install_miopen.sh RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh @@ -169,6 +225,10 @@ ENV XPU_DRIVER_TYPE ROLLING RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 ADD ./common/install_xpu.sh install_xpu.sh +<<<<<<< HEAD ENV XPU_VERSION 2025.0 +======= +ENV XPU_VERSION 2025.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN bash ./install_xpu.sh && rm install_xpu.sh RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index 8f5d4c3361ce..35faa9bf2605 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -1,9 +1,15 
@@ FROM quay.io/pypa/manylinux_2_28_aarch64 as base +<<<<<<< HEAD # Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8. ARG GCCTOOLSET_VERSION=11 # Language variabes +======= +ARG GCCTOOLSET_VERSION=13 + +# Language variables +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV LC_ALL=en_US.UTF-8 ENV LANG=en_US.UTF-8 ENV LANGUAGE=en_US.UTF-8 @@ -36,7 +42,14 @@ RUN yum install -y \ yasm \ zstd \ sudo \ +<<<<<<< HEAD gcc-toolset-${GCCTOOLSET_VERSION}-toolchain +======= + gcc-toolset-${GCCTOOLSET_VERSION}-gcc \ + gcc-toolset-${GCCTOOLSET_VERSION}-gcc-c++ \ + gcc-toolset-${GCCTOOLSET_VERSION}-gcc-gfortran \ + gcc-toolset-${GCCTOOLSET_VERSION}-gdb +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION @@ -56,12 +69,20 @@ RUN git config --global --add safe.directory "*" FROM base as openblas # Install openblas +<<<<<<< HEAD +======= +ARG OPENBLAS_VERSION +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ADD ./common/install_openblas.sh install_openblas.sh RUN bash ./install_openblas.sh && rm install_openblas.sh FROM base as final +<<<<<<< HEAD # remove unncessary python versions +======= +# remove unnecessary python versions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 index dfd766b4dd5a..654e50f11caa 100644 --- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 @@ -1,7 +1,11 @@ FROM quay.io/pypa/manylinux_2_28_aarch64 as base # Cuda ARM build needs gcc 11 +<<<<<<< HEAD ARG DEVTOOLSET_VERSION=11 +======= +ARG DEVTOOLSET_VERSION=13 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Language variables ENV LC_ALL=en_US.UTF-8 @@ -34,7 +38,14 @@ RUN yum install -y \ zstd \ libgomp \ sudo \ +<<<<<<< HEAD gcc-toolset-${DEVTOOLSET_VERSION}-toolchain +======= + gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ + gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ + gcc-toolset-${DEVTOOLSET_VERSION}-gdb +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Ensure the expected devtoolset is used ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH @@ -57,7 +68,11 @@ RUN bash ./install_openssl.sh && rm install_openssl.sh ENV SSL_CERT_FILE=/opt/_internal/certs.pem FROM openssl as final +<<<<<<< HEAD # remove unncessary python versions +======= +# remove unnecessary python versions +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 @@ -66,8 +81,16 @@ RUN rm -rf /opt/python/cp34-cp34m 
/opt/_internal/cpython-3.4.6 FROM base as cuda ARG BASE_CUDA_VERSION # Install CUDA +<<<<<<< HEAD ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh +======= +ADD ./common/install_cuda.sh install_cuda.sh +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./common/install_cusparselt.sh install_cusparselt.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FROM base as magma ARG BASE_CUDA_VERSION diff --git a/.ci/docker/manywheel/Dockerfile_s390x b/.ci/docker/manywheel/Dockerfile_s390x index 63a6a67c28ce..a9fc14ad1164 100644 --- a/.ci/docker/manywheel/Dockerfile_s390x +++ b/.ci/docker/manywheel/Dockerfile_s390x @@ -5,7 +5,13 @@ ENV LC_ALL=C.UTF-8 ENV LANG=C.UTF-8 ENV LANGUAGE=C.UTF-8 +<<<<<<< HEAD ARG DEVTOOLSET_VERSION=13 +======= +# there is a bugfix in gcc >= 14 for precompiled headers and s390x vectorization interaction. +# with earlier gcc versions test/inductor/test_cpu_cpp_wrapper.py will fail. +ARG DEVTOOLSET_VERSION=14 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Installed needed OS packages. This is to support all # the binary builds (torch, vision, audio, text, data) RUN yum -y install epel-release @@ -42,6 +48,10 @@ RUN yum install -y \ llvm-devel \ libzstd-devel \ python3.12-devel \ +<<<<<<< HEAD +======= + python3.12-test \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python3.12-setuptools \ python3.12-pip \ python3-virtualenv \ @@ -57,7 +67,12 @@ RUN yum install -y \ libxslt-devel \ libxml2-devel \ openssl-devel \ +<<<<<<< HEAD valgrind +======= + valgrind \ + ninja-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH @@ -101,6 +116,7 @@ CMD ["/bin/bash"] # install test dependencies: # - grpcio requires system openssl, bundled crypto fails to build +<<<<<<< HEAD # - ml_dtypes 0.4.0 requires some fixes provided in later commits to build RUN dnf install -y \ protobuf-devel \ @@ -122,3 +138,37 @@ RUN cd ~ && \ python3 setup.py bdist_wheel && \ pip3 install dist/*.whl && \ rm -rf ml_dtypes +======= +RUN dnf install -y \ + hdf5-devel \ + python3-h5py \ + git + +RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio + +# cmake-3.28.0 from pip for onnxruntime +RUN python3 -mpip install cmake==3.28.0 + +# build onnxruntime 1.21.0 from sources. +# it is not possible to build it from sources using pip, +# so just build it from upstream repository. +# h5py is dependency of onnxruntime_training. +# h5py==3.11.0 builds with hdf5-devel 1.10.5 from repository. +# h5py 3.11.0 doesn't build with numpy >= 2.3.0. +# install newest flatbuffers version first: +# for some reason old version is getting pulled in otherwise. +# packaging package is required for onnxruntime wheel build. 
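Brief aside before the onnxruntime build block that follows: throughout these manylinux Dockerfiles the devtoolset (GCC 13, or 14 on s390x per the comment above) is selected purely by prepending its root to PATH and LD_LIBRARY_PATH. A hedged verification sketch that can be run inside such an image to confirm the toolset compiler is the one being picked up; the check commands are an addition, only the exported paths come from the Dockerfiles:

#!/bin/bash
# Verification sketch for the gcc-toolset selection done via ENV in the Dockerfiles.
set -e
DEVTOOLSET_VERSION="${DEVTOOLSET_VERSION:-14}"   # 14 on s390x, 13 elsewhere in this patch
export PATH="/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH"
export LD_LIBRARY_PATH="/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:${LD_LIBRARY_PATH:-}"
command -v gcc       # expect /opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin/gcc
gcc --version
gfortran --version   # provided by the gcc-toolset-*-gcc-gfortran package installed above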
+RUN pip3 install flatbuffers && \ + pip3 install cython 'pkgconfig>=1.5.5' 'setuptools>=77' 'numpy<2.3.0' && \ + pip3 install --no-build-isolation h5py==3.11.0 && \ + pip3 install packaging && \ + git clone https://github.com/microsoft/onnxruntime && \ + cd onnxruntime && git checkout v1.21.0 && \ + git submodule update --init --recursive && \ + ./build.sh --config Release --parallel 0 --enable_pybind \ + --build_wheel --enable_training --enable_training_apis \ + --enable_training_ops --skip_tests --allow_running_as_root \ + --compile_no_warning_as_error && \ + pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \ + cd .. && /bin/rm -rf ./onnxruntime +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index 0601d7605d84..dfab41d4f373 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -1,7 +1,11 @@ #!/usr/bin/env bash # Script used only in CD pipeline +<<<<<<< HEAD set -eou pipefail +======= +set -exou pipefail +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TOPDIR=$(git rev-parse --show-toplevel) @@ -9,6 +13,7 @@ image="$1" shift if [ -z "${image}" ]; then +<<<<<<< HEAD echo "Usage: $0 IMAGE" exit 1 fi @@ -54,17 +59,61 @@ case ${GPU_ARCH_TYPE} in cpu-cxx11-abi) TARGET=final DOCKER_TAG=cpu-cxx11-abi +======= + echo "Usage: $0 IMAGE:ARCHTAG" + exit 1 +fi + +# Go from imagename:tag to tag +DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') + +GPU_ARCH_VERSION="" +if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then + # extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8 + GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') +elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then + # extract rocm version from image name. e.g. 
manylinux2_28-builder:rocm6.2.4 returns 6.2.4 + GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') +fi + +MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} +DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} +OPENBLAS_VERSION=${OPENBLAS_VERSION:-} + +case ${image} in + manylinux2_28-builder:cpu) + TARGET=cpu_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="2_28" + ;; + manylinux2_28_aarch64-builder:cpu-aarch64) + TARGET=final + GPU_IMAGE=arm64v8/almalinux:8 + DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" + MANY_LINUX_VERSION="2_28_aarch64" + OPENBLAS_VERSION="v0.3.30" + ;; + manylinuxcxx11-abi-builder:cpu-cxx11-abi) + TARGET=final +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_IMAGE="" DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" MANY_LINUX_VERSION="cxx11-abi" ;; +<<<<<<< HEAD cpu-s390x) TARGET=final DOCKER_TAG=cpu-s390x +======= + manylinuxs390x-builder:cpu-s390x) + TARGET=final +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_IMAGE=s390x/almalinux:8 DOCKER_GPU_BUILD_ARG="" MANY_LINUX_VERSION="s390x" ;; +<<<<<<< HEAD cuda) TARGET=cuda_final DOCKER_TAG=cuda${GPU_ARCH_VERSION} @@ -75,10 +124,15 @@ case ${GPU_ARCH_TYPE} in cuda-manylinux_2_28) TARGET=cuda_final DOCKER_TAG=cuda${GPU_ARCH_VERSION} +======= + manylinux2_28-builder:cuda11*) + TARGET=cuda_final +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_IMAGE=amd64/almalinux:8 DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" MANY_LINUX_VERSION="2_28" ;; +<<<<<<< HEAD cuda-aarch64) TARGET=cuda_final DOCKER_TAG=cuda${GPU_ARCH_VERSION} @@ -103,16 +157,46 @@ case ${GPU_ARCH_TYPE} in xpu) TARGET=xpu_final DOCKER_TAG=xpu +======= + manylinux2_28-builder:cuda12*) + TARGET=cuda_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="2_28" + ;; + manylinuxaarch64-builder:cuda*) + TARGET=cuda_final + GPU_IMAGE=amd64/almalinux:8 + DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" + MANY_LINUX_VERSION="aarch64" + DOCKERFILE_SUFFIX="_cuda_aarch64" + ;; + manylinux2_28-builder:rocm*) + TARGET=rocm_final + MANY_LINUX_VERSION="2_28" + DEVTOOLSET_VERSION="11" + GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" + ;; + manylinux2_28-builder:xpu) + TARGET=xpu_final +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) GPU_IMAGE=amd64/almalinux:8 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" MANY_LINUX_VERSION="2_28" ;; *) +<<<<<<< HEAD echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}" +======= + echo "ERROR: Unrecognized image name: ${image}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exit 1 ;; esac 
+<<<<<<< HEAD IMAGES='' if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then @@ -158,3 +242,28 @@ if [[ "${WITH_PUSH}" == true ]]; then fi ) fi +======= +if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then + DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION} +fi +# Only activate this if in CI +if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then + # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 + # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. + sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service + sudo systemctl daemon-reload + sudo systemctl restart docker +fi + +tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') + +DOCKER_BUILDKIT=1 docker build \ + ${DOCKER_GPU_BUILD_ARG} \ + --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ + --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \ + --target "${TARGET}" \ + -t "${tmp_tag}" \ + $@ \ + -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \ + "${TOPDIR}/.ci/docker/" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/manywheel/build_scripts/build.sh b/.ci/docker/manywheel/build_scripts/build.sh index e2cb1c7f27cd..75610a6fa94e 100644 --- a/.ci/docker/manywheel/build_scripts/build.sh +++ b/.ci/docker/manywheel/build_scripts/build.sh @@ -97,7 +97,11 @@ find /opt/_internal -type f -print0 \ | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true # We do not need the Python test suites, or indeed the precompiled .pyc and # .pyo files. Partially cribbed from: +<<<<<<< HEAD # https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile +======= +# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) find /opt/_internal \ \( -type d -a -name test -o -name tests \) \ -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \ diff --git a/.ci/docker/manywheel/build_scripts/build_utils.sh b/.ci/docker/manywheel/build_scripts/build_utils.sh index cec871cac4f6..fa7631fb76f6 100755 --- a/.ci/docker/manywheel/build_scripts/build_utils.sh +++ b/.ci/docker/manywheel/build_scripts/build_utils.sh @@ -2,7 +2,11 @@ # Helper utilities for build # Script used only in CD pipeline +<<<<<<< HEAD OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ +======= +OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CURL_DOWNLOAD_URL=https://curl.se/download AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 56abad7aafeb..a1d6751cdee4 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -16,7 +16,11 @@ click #test that import: coremltools==5.0b5 ; python_version < "3.12" +<<<<<<< HEAD coremltools==7.2 ; python_version == "3.12" +======= +coremltools==8.3 ; python_version == "3.12" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: Apple framework for ML integration #Pinned versions: 5.0b5 #test that import: @@ -42,9 +46,15 @@ fbscribelogger==0.1.7 #Pinned versions: 
0.1.6 #test that import: +<<<<<<< HEAD flatbuffers==2.0 #Description: cross platform serialization library #Pinned versions: 2.0 +======= +flatbuffers==24.12.23 +#Description: cross platform serialization library +#Pinned versions: 24.12.23 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: hypothesis==5.35.1 @@ -92,10 +102,17 @@ librosa==0.10.2 ; python_version == "3.12" #Pinned versions: #test that import: +<<<<<<< HEAD mypy==1.14.0 # Pin MyPy version because new errors are likely to appear with each release #Description: linter #Pinned versions: 1.14.0 +======= +mypy==1.16.0 +# Pin MyPy version because new errors are likely to appear with each release +#Description: linter +#Pinned versions: 1.16.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: test_typing.py, test_type_hints.py networkx==2.8.8 @@ -104,10 +121,17 @@ networkx==2.8.8 #Pinned versions: 2.8.8 #test that import: functorch +<<<<<<< HEAD #ninja #Description: build system. Note that it install from #here breaks things so it is commented out #Pinned versions: 1.10.0.post1 +======= +ninja==1.11.1.3 +#Description: build system. Used in some tests. Used in build to generate build +#time tracing information +#Pinned versions: 1.11.1.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py numba==0.60.0 ; python_version == "3.9" @@ -258,11 +282,14 @@ scipy==1.14.1 ; python_version > "3.9" #Pinned versions: #test that import: +<<<<<<< HEAD tlparse==0.3.30 #Description: parse logs produced by torch.compile #Pinned versions: #test that import: dynamo/test_structured_trace.py +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # needed by torchgen utils typing-extensions>=4.10.0 #Description: type hints for python @@ -340,7 +367,11 @@ onnx==1.18.0 ; python_version == "3.13" #Pinned versions: #test that import: +<<<<<<< HEAD onnxscript==0.2.2 +======= +onnxscript==0.3.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -354,7 +385,11 @@ parameterized==0.8.1 #Pinned versions: 1.24.0 #test that import: test_sac_estimator.py +<<<<<<< HEAD pwlf==2.2.1 ; python_version >= "3.8" +======= +pwlf==2.2.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: required for testing torch/distributed/_tools/sac_estimator.py #Pinned versions: 2.2.1 #test that import: test_sac_estimator.py @@ -366,10 +401,16 @@ PyYAML pyzstd setuptools +<<<<<<< HEAD ninja==1.11.1 ; platform_machine == "aarch64" scons==4.5.2 ; platform_machine == "aarch64" pulp==2.9.0 ; python_version >= "3.8" +======= +scons==4.5.2 ; platform_machine == "aarch64" + +pulp==2.9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #Description: required for testing ilp formulaiton under torch/distributed/_tools #Pinned versions: 2.9.0 #test that import: test_sac_ilp.py @@ -378,3 +419,16 @@ 
dataclasses_json==0.6.7 #Description: required for data pipeline and scripts under tools/stats #Pinned versions: 0.6.7 #test that import: +<<<<<<< HEAD +======= + +cmake==4.0.0 +#Description: required for building + +tlparse==0.3.30 +#Description: required for log parsing + +cuda-bindings>=12.0,<13.0 +#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits. +#test that import: test_cuda.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index dcbdb42ee64c..3524240ef2ea 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,15 +1,35 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 +<<<<<<< HEAD -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought is probably # something related to Docker setup. We can investigate this later +======= +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 + +# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering +# but it doesn't seem to work and hangs around idly. The initial thought that it is probably +# something related to Docker setup. We can investigate this later. 
+ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sphinxcontrib.katex==0.8.6 #Description: This is used to generate PyTorch docs #Pinned versions: 0.8.6 +<<<<<<< HEAD +======= +sphinxext-opengraph==0.9.1 +#Description: This is used to generate PyTorch docs +#Pinned versions: 0.9.1 + +sphinx_sitemap==2.6.0 +#Description: This is used to generate sitemap for PyTorch docs +#Pinned versions: 2.6.0 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) matplotlib==3.5.3 #Description: This is used to generate PyTorch docs #Pinned versions: 3.5.3 @@ -46,5 +66,10 @@ myst-nb==0.17.2 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd==0.4.5 sphinx-copybutton==0.5.0 +<<<<<<< HEAD sphinx-panels==0.4.1 +======= +sphinx-design==0.4.0 +sphinxcontrib-mermaid==1.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) myst-parser==0.18.1 diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt index bea438e9ade7..346f470c5c21 100644 --- a/.ci/docker/triton_version.txt +++ b/.ci/docker/triton_version.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 3.3.1 +======= +3.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/docker/triton_xpu_version.txt b/.ci/docker/triton_xpu_version.txt new file mode 100644 index 000000000000..18091983f59d --- /dev/null +++ b/.ci/docker/triton_xpu_version.txt @@ -0,0 +1 @@ +3.4.0 diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index a041ff3a3671..0f1986d91455 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -28,7 +28,10 @@ ARG ANACONDA_PYTHON_VERSION ARG BUILD_ENVIRONMENT ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH +<<<<<<< HEAD ARG CONDA_CMAKE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) COPY requirements-ci.txt /opt/conda/requirements-ci.txt COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh @@ -44,6 +47,7 @@ ARG CLANG_VERSION COPY ./common/install_clang.sh install_clang.sh RUN bash ./install_clang.sh && rm install_clang.sh +<<<<<<< HEAD # (optional) Install protobuf for ONNX ARG PROTOBUF COPY ./common/install_protobuf.sh install_protobuf.sh @@ -58,6 +62,8 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi RUN rm install_db.sh ENV INSTALLED_DB ${DB} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install vision packages like OpenCV ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ @@ -71,7 +77,11 @@ COPY ./common/install_rocm.sh install_rocm.sh RUN bash ./install_rocm.sh RUN rm install_rocm.sh COPY ./common/install_rocm_magma.sh install_rocm_magma.sh +<<<<<<< HEAD RUN bash ./install_rocm_magma.sh +======= +RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN rm install_rocm_magma.sh ADD ./common/install_miopen.sh install_miopen.sh RUN bash 
./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh @@ -116,12 +126,15 @@ COPY ci_commit_pins/timm.txt timm.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +<<<<<<< HEAD # (optional) Install non-default CMake version ARG CMAKE_VERSION COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index 41f690c4ab38..d1fc652cf71d 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -28,7 +28,10 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh # Install conda and other packages (e.g., numpy, pytest) ARG ANACONDA_PYTHON_VERSION +<<<<<<< HEAD ARG CONDA_CMAKE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG DOCS ARG BUILD_ENVIRONMENT ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION @@ -73,6 +76,7 @@ ARG TRITON COPY ./common/install_triton.sh install_triton.sh COPY ./common/common_utils.sh common_utils.sh COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt +<<<<<<< HEAD COPY triton_version.txt triton_version.txt RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt @@ -84,6 +88,12 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi RUN rm install_db.sh ENV INSTALLED_DB ${DB} +======= +COPY triton_xpu_version.txt triton_version.txt +RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi +RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install vision packages like OpenCV ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ @@ -91,12 +101,15 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi RUN rm install_vision.sh cache_vision_models.sh common_utils.sh ENV INSTALLED_VISION ${VISION} +<<<<<<< HEAD # (optional) Install non-default CMake version ARG CMAKE_VERSION COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 44bc3b8f2c25..c00d2e38248f 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -1,6 +1,10 @@ ARG UBUNTU_VERSION +<<<<<<< HEAD FROM ubuntu:${UBUNTU_VERSION} +======= +FROM ubuntu:${UBUNTU_VERSION} as base +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG UBUNTU_VERSION @@ -28,7 +32,10 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh # Install conda and other packages (e.g., 
numpy, pytest) ARG ANACONDA_PYTHON_VERSION +<<<<<<< HEAD ARG CONDA_CMAKE +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ARG DOCS ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH @@ -52,9 +59,23 @@ RUN bash ./install_lcov.sh && rm install_lcov.sh # Install cuda and cudnn ARG CUDA_VERSION COPY ./common/install_cuda.sh install_cuda.sh +<<<<<<< HEAD RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh ENV DESIRED_CUDA ${CUDA_VERSION} ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH +======= +COPY ./common/install_nccl.sh install_nccl.sh +COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ +COPY ./common/install_cusparselt.sh install_cusparselt.sh +RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh +ENV DESIRED_CUDA ${CUDA_VERSION} +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH +# No effect if cuda not installed +ENV USE_SYSTEM_NCCL=1 +ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/" +ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install UCC ARG UCX_COMMIT @@ -67,6 +88,7 @@ ADD ./common/install_ucc.sh install_ucc.sh RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi RUN rm install_ucc.sh +<<<<<<< HEAD # (optional) Install protobuf for ONNX ARG PROTOBUF COPY ./common/install_protobuf.sh install_protobuf.sh @@ -81,6 +103,8 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi RUN rm install_db.sh ENV INSTALLED_DB ${DB} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install vision packages like OpenCV ARG VISION COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ @@ -88,6 +112,7 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi RUN rm install_vision.sh cache_vision_models.sh common_utils.sh ENV INSTALLED_VISION ${VISION} +<<<<<<< HEAD # (optional) Install Vulkan SDK ARG VULKAN_SDK_VERSION COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh @@ -106,6 +131,8 @@ COPY ./common/install_cmake.sh install_cmake.sh RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi RUN rm install_cmake.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) Install non-default Ninja version ARG NINJA_VERSION COPY ./common/install_ninja.sh install_ninja.sh @@ -127,6 +154,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt ARG TRITON +<<<<<<< HEAD # Install triton, this needs to be done before sccache because the latter will # try to reach out to S3, which docker build runners don't have access COPY ./common/install_triton.sh install_triton.sh @@ -141,6 +169,23 @@ COPY ./common/common_utils.sh common_utils.sh COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi RUN rm install_triton.sh common_utils.sh triton-cpu.txt +======= +ARG TRITON_CPU + +# Create a separate stage for building Triton and Triton-CPU. 
install_triton
+# will check for the presence of env vars
+FROM base as triton-builder
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
+RUN bash ./install_triton.sh
+
+FROM base as final
+COPY --from=triton-builder /opt/triton /opt/triton
+RUN if [ -n "${TRITON}" ] || [ -n "${TRITON_CPU}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi
+RUN rm -rf /opt/triton
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

ARG EXECUTORCH
# Build and install executorch
@@ -171,6 +216,15 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL}

+<<<<<<< HEAD
+=======
+ARG OPENBLAS
+COPY ./common/install_openblas.sh install_openblas.sh
+RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi
+RUN rm install_openblas.sh
+ENV INSTALLED_OPENBLAS ${OPENBLAS}
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh
diff --git a/.ci/magma-rocm/.gitignore b/.ci/magma-rocm/.gitignore
new file mode 100644
index 000000000000..6c64316195bc
--- /dev/null
+++ b/.ci/magma-rocm/.gitignore
@@ -0,0 +1,2 @@
+output/
+magma-rocm*/
diff --git a/.ci/magma-rocm/Makefile b/.ci/magma-rocm/Makefile
new file mode 100644
index 000000000000..5f63da87bc4d
--- /dev/null
+++ b/.ci/magma-rocm/Makefile
@@ -0,0 +1,35 @@
+SHELL=/usr/bin/env bash
+
+DOCKER_CMD ?= docker
+DESIRED_ROCM ?= 6.4
+DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
+PACKAGE_NAME = magma-rocm
+# inherit this from underlying docker image, do not pass this env var to docker
+#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
+
+DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
+	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
+	-w /builder \
+	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \
+	-e DESIRED_ROCM=${DESIRED_ROCM} \
+	"pytorch/almalinux-builder:rocm${DESIRED_ROCM}" \
+	magma-rocm/build_magma.sh
+
+.PHONY: all
+all: magma-rocm64
+all: magma-rocm63
+
+.PHONY:
+clean:
+	$(RM) -r magma-*
+	$(RM) -r output
+
+.PHONY: magma-rocm64
+magma-rocm64: DESIRED_ROCM := 6.4
+magma-rocm64:
+	$(DOCKER_RUN)
+
+.PHONY: magma-rocm63
+magma-rocm63: DESIRED_ROCM := 6.3
+magma-rocm63:
+	$(DOCKER_RUN)
diff --git a/.ci/magma-rocm/README.md b/.ci/magma-rocm/README.md
new file mode 100644
index 000000000000..cfc3cd3ab163
--- /dev/null
+++ b/.ci/magma-rocm/README.md
@@ -0,0 +1,48 @@
+# Magma ROCm
+
+This folder contains the scripts and configurations to build libmagma.so, linked for various versions of ROCm.
+
+## Building
+
+Look in the `Makefile` for available targets to build. To build any target, for example `magma-rocm63`, run
+
+```
+# Using `docker`
+make magma-rocm63
+
+# Using `podman`
+DOCKER_CMD=podman make magma-rocm63
+```
+
+This spawns a container from the `pytorch/almalinux-builder:rocm${DESIRED_ROCM}` docker image (see `DOCKER_RUN` in the `Makefile`), which has the required `devtoolset` and ROCm versions installed.
+Within the container, it runs `build_magma.sh` with the correct environment variables set, which packages the necessary files
+into a tarball, with the following structure:
+
+```
+.
+├── include # header files
+├── lib # libmagma.so
+├── info
+│ ├── licenses # license file
+│ └── recipe # build script
+```
+
+More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version.
+The resulting binaries are placed in the `output` folder.
+
+
+## Pushing
+
+Packages can be uploaded to an S3 bucket using:
+
+```
+aws s3 cp output/*/magma-rocm*.bz2
+```
+
+If you do not have upload permissions, please ping @seemethere or @soumith to gain access.
+
+## New versions
+
+New ROCm versions can be added by creating a new make target with the next desired version. For ROCm version N.n, the target should be named `magma-rocmNn`.
+
+Make sure to edit the appropriate environment variables (e.g., DESIRED_ROCM) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh
new file mode 100755
index 000000000000..4acb3fb0dc3b
--- /dev/null
+++ b/.ci/magma-rocm/build_magma.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+# Environment variables
+# The script expects DESIRED_ROCM and PACKAGE_NAME to be set
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+# Version 2.7.2 + ROCm related updates
+MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
+
+# Folders for the build
+PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
+PACKAGE_DIR=${ROOT_DIR}/magma-rocm/${PACKAGE_NAME} # build workspace
+PACKAGE_OUTPUT=${ROOT_DIR}/magma-rocm/output # where tarballs are stored
+PACKAGE_BUILD=${PACKAGE_DIR} # where the content of the tarball is prepared
+PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
+PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
+mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
+
+# Fetch magma sources and verify checksum
+pushd ${PACKAGE_DIR}
+git clone https://bitbucket.org/icl/magma.git
+pushd magma
+git checkout ${MAGMA_VERSION}
+popd
+popd
+
+# build
+pushd ${PACKAGE_DIR}/magma
+# The build.sh script expects to be executed from the sources root folder
+INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
+popd
+
+# Package recipe, license and tarball
+# Folder and package name are backward compatible for the build workflow
+cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
+cp ${PACKAGE_DIR}/magma/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
+pushd ${PACKAGE_BUILD}
+tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
+echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
+popd
diff --git a/.ci/magma-rocm/package_files/build.sh b/.ci/magma-rocm/package_files/build.sh
new file mode 100755
index 000000000000..d0f0911db525
--- /dev/null
+++ b/.ci/magma-rocm/package_files/build.sh
@@ -0,0 +1,38 @@
+# Magma build scripts need `python`
+ln -sf /usr/bin/python3 /usr/bin/python
+
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  almalinux)
+    yum install -y gcc-gfortran
+    ;;
+  *)
+    echo "No preinstalls to build magma..."
+ ;; +esac + +MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} + +cp make.inc-examples/make.inc.hip-gcc-mkl make.inc +echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc +if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then + echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc +fi +echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc +echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc +export PATH="${PATH}:/opt/rocm/bin" +if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then + amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'` +else + amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs` +fi +for arch in $amdgpu_targets; do + echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc +done +# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition +sed -i 's/^FOPENMP/#FOPENMP/g' make.inc +make -f make.gen.hipMAGMA -j $(nproc) +LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" +make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" +cp -R lib ${INSTALL_DIR} +cp -R include ${INSTALL_DIR} diff --git a/.ci/magma/Makefile b/.ci/magma/Makefile index 17c62b71d4e2..bd8587b94162 100644 --- a/.ci/magma/Makefile +++ b/.ci/magma/Makefile @@ -1,7 +1,11 @@ SHELL=/usr/bin/env bash DOCKER_CMD ?= docker +<<<<<<< HEAD DESIRED_CUDA ?= 11.8 +======= +DESIRED_CUDA ?= 12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA)) PACKAGE_NAME = magma-cuda CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 @@ -12,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ -e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \ -e DESIRED_CUDA=${DESIRED_CUDA} \ -e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \ +<<<<<<< HEAD "pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \ magma/build_magma.sh @@ -20,12 +25,30 @@ all: magma-cuda128 all: magma-cuda126 all: magma-cuda124 all: magma-cuda118 +======= + "pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \ + magma/build_magma.sh + +.PHONY: all +all: magma-cuda129 +all: magma-cuda128 +all: magma-cuda126 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: clean: $(RM) -r magma-* $(RM) -r output +<<<<<<< HEAD +======= +.PHONY: magma-cuda129 +magma-cuda129: DESIRED_CUDA := 12.9 +magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +magma-cuda129: + $(DOCKER_RUN) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .PHONY: magma-cuda128 magma-cuda128: DESIRED_CUDA := 12.8 magma-cuda128: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 @@ -36,6 +59,7 @@ magma-cuda128: magma-cuda126: DESIRED_CUDA := 12.6 magma-cuda126: $(DOCKER_RUN) +<<<<<<< HEAD .PHONY: magma-cuda124 magma-cuda124: DESIRED_CUDA := 12.4 @@ -47,3 +71,5 @@ magma-cuda118: DESIRED_CUDA := 11.8 magma-cuda118: CUDA_ARCH_LIST += -gencode 
arch=compute_37,code=sm_37 magma-cuda118: $(DOCKER_RUN) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index 025b19a24a98..daaad261e671 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -18,12 +18,19 @@ retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } +<<<<<<< HEAD PLATFORM="manylinux2014_x86_64" # TODO move this into the Docker images OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then retry yum install -q -y zip openssl elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +======= +PLATFORM="" +# TODO move this into the Docker images +OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) +if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retry yum install -q -y zip openssl PLATFORM="manylinux_2_28_x86_64" elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then @@ -33,9 +40,17 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968 # shellcheck disable=SC2046 sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") +<<<<<<< HEAD retry apt-get update retry apt-get -y install zip openssl +======= + retry apt-get update + retry apt-get -y install zip openssl +else + echo "Unknown OS: '$OS_NAME'" + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # We use the package name to test the package by passing this to 'pip install' @@ -79,8 +94,11 @@ if [[ -e /opt/openssl ]]; then export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH fi +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir -p /tmp/$WHEELHOUSE_DIR export PATCHELF_BIN=/usr/local/bin/patchelf @@ -99,6 +117,10 @@ if [[ -z "$PYTORCH_ROOT" ]]; then exit 1 fi pushd "$PYTORCH_ROOT" +<<<<<<< HEAD +======= +retry pip install -q cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python setup.py clean retry pip install -qr requirements.txt case ${DESIRED_PYTHON} in @@ -111,12 +133,15 @@ case ${DESIRED_PYTHON} in ;; esac +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then export _GLIBCXX_USE_CXX11_ABI=1 else export _GLIBCXX_USE_CXX11_ABI=0 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then echo "Calling build_amd.py at $(date)" python tools/amd_build/build_amd.py @@ -158,7 +183,11 @@ if [[ "$USE_SPLIT_BUILD" == "true" ]]; then BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ +<<<<<<< HEAD python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR --cmake +======= + CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Finished setup.py bdist_wheel 
for split build (BUILD_PYTHON_ONLY)" else time CMAKE_ARGS=${CMAKE_ARGS[@]} \ @@ -209,12 +238,15 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then mkdir -p /tmp/$LIBTORCH_HOUSE_DIR +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then LIBTORCH_ABI="cxx11-abi-" else LIBTORCH_ABI= fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \ /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 8f8b37b46e59..dc07294b1450 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -15,6 +15,12 @@ export INSTALL_TEST=0 # dont install test binaries into site-packages export USE_CUPTI_SO=0 export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build export USE_CUFILE=${USE_CUFILE:-1} +<<<<<<< HEAD +======= +export USE_SYSTEM_NCCL=1 +export NCCL_INCLUDE_DIR="/usr/local/cuda/include/" +export NCCL_LIB_DIR="/usr/local/cuda/lib64/" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Keep an array of cmake variables to add to if [[ -z "$CMAKE_ARGS" ]]; then @@ -36,10 +42,15 @@ if [[ -n "$DESIRED_CUDA" ]]; then if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then CUDA_VERSION=${DESIRED_CUDA} else +<<<<<<< HEAD # cu90, cu92, cu100, cu101 if [[ ${#DESIRED_CUDA} -eq 4 ]]; then CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}" elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then +======= + # cu126, cu128 etc... 
+ if [[ ${#DESIRED_CUDA} -eq 5 ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}" fi fi @@ -50,6 +61,7 @@ else fi cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.') +<<<<<<< HEAD TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6" case ${CUDA_VERSION} in @@ -68,6 +80,25 @@ case ${CUDA_VERSION} in 11.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0" EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") +======= +EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") + +case ${CUDA_VERSION} in + #removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases + #however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517 + 12.8) + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0" + ;; + 12.9) + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX" + # WAR to resolve the ld error in libtorch build with CUDA 12.9 + if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then + TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX" + fi + ;; + 12.6) + TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ;; *) echo "unknown cuda version $CUDA_VERSION" @@ -91,14 +122,24 @@ fi mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) +<<<<<<< HEAD if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +======= +if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" +<<<<<<< HEAD +======= +else + echo "Unknown OS: '$OS_NAME'" + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi DEPS_LIST=( @@ -108,6 +149,7 @@ DEPS_SONAME=( "libgomp.so.1" ) +<<<<<<< HEAD # CUDA 11.8 have to ship the libcusparseLt.so.0 with the binary # since nvidia-cusparselt-cu11 is not available in PYPI if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then @@ -128,11 +170,18 @@ fi # CUDA_VERSION 12.4, 12.6, 12.8 +======= + +# CUDA_VERSION 12.6, 12.8, 12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ $CUDA_VERSION == 12* ]]; then export USE_STATIC_CUDNN=0 # Try parallelizing nvcc as well export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." 
DEPS_LIST+=( @@ -148,9 +197,16 @@ if [[ $CUDA_VERSION == 12* ]]; then "/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcusparseLt.so.0" "/usr/local/cuda/lib64/libcudart.so.12" +<<<<<<< HEAD "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/lib64/libnvrtc-builtins.so" +======= + "/usr/local/cuda/lib64/libnvrtc.so.12" + "/usr/local/cuda/lib64/libnvrtc-builtins.so" + "/usr/local/cuda/lib64/libcufile.so.0" + "/usr/local/cuda/lib64/libcufile_rdma.so.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) DEPS_SONAME+=( "libcudnn_adv.so.9" @@ -165,6 +221,7 @@ if [[ $CUDA_VERSION == 12* ]]; then "libcublasLt.so.12" "libcusparseLt.so.0" "libcudart.so.12" +<<<<<<< HEAD "libnvToolsExt.so.1" "libnvrtc.so.12" "libnvrtc-builtins.so" @@ -179,6 +236,13 @@ if [[ $CUDA_VERSION == 12* ]]; then "libcufile_rdma.so.1" ) fi +======= + "libnvrtc.so.12" + "libnvrtc-builtins.so" + "libcufile.so.0" + "libcufile_rdma.so.1" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else echo "Using nvidia libs from pypi." CUDA_RPATHS=( @@ -191,6 +255,7 @@ if [[ $CUDA_VERSION == 12* ]]; then '$ORIGIN/../../nvidia/curand/lib' '$ORIGIN/../../nvidia/cusolver/lib' '$ORIGIN/../../nvidia/cusparse/lib' +<<<<<<< HEAD '$ORIGIN/../../cusparselt/lib' '$ORIGIN/../../nvidia/nccl/lib' '$ORIGIN/../../nvidia/nvtx/lib' @@ -267,18 +332,31 @@ elif [[ $CUDA_VERSION == "11.8" ]]; then '$ORIGIN/../../nvidia/cusparse/lib' '$ORIGIN/../../nvidia/nccl/lib' '$ORIGIN/../../nvidia/nvtx/lib' +======= + '$ORIGIN/../../nvidia/cusparselt/lib' + '$ORIGIN/../../cusparselt/lib' + '$ORIGIN/../../nvidia/nccl/lib' + '$ORIGIN/../../nvidia/nvtx/lib' + '$ORIGIN/../../nvidia/cufile/lib' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' export FORCE_RPATH="--force-rpath" export USE_STATIC_NCCL=0 +<<<<<<< HEAD export USE_SYSTEM_NCCL=1 export ATEN_STATIC_CUDA=0 export USE_CUDA_STATIC_LINK=0 export USE_CUPTI_SO=1 export NCCL_INCLUDE_DIR="/usr/local/cuda/include/" export NCCL_LIB_DIR="/usr/local/cuda/lib64/" +======= + export ATEN_STATIC_CUDA=0 + export USE_CUDA_STATIC_LINK=0 + export USE_CUPTI_SO=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi else echo "Unknown cuda version $CUDA_VERSION" diff --git a/.ci/manywheel/build_libtorch.sh b/.ci/manywheel/build_libtorch.sh index 41d8c4e15272..f2da1a850d2d 100644 --- a/.ci/manywheel/build_libtorch.sh +++ b/.ci/manywheel/build_libtorch.sh @@ -22,9 +22,13 @@ retry () { # TODO move this into the Docker images OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` +<<<<<<< HEAD if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then retry yum install -q -y zip openssl elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +======= +if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retry yum install -q -y zip openssl elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then retry dnf install -q -y zip openssl @@ -35,6 +39,12 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ 
-type f -name "*.list") retry apt-get update retry apt-get -y install zip openssl +<<<<<<< HEAD +======= +else + echo "Unknown OS: '$OS_NAME'" + exit 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if @@ -91,16 +101,23 @@ if [[ -z "$PYTORCH_ROOT" ]]; then exit 1 fi pushd "$PYTORCH_ROOT" +<<<<<<< HEAD +======= +retry pip install -q cmake +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python setup.py clean retry pip install -qr requirements.txt retry pip install -q numpy==2.0.1 +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then export _GLIBCXX_USE_CXX11_ABI=1 else export _GLIBCXX_USE_CXX11_ABI=0 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then echo "Calling build_amd.py at $(date)" python tools/amd_build/build_amd.py @@ -169,12 +186,15 @@ fi ) +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then LIBTORCH_ABI="cxx11-abi-" else LIBTORCH_ABI= fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ( set -x diff --git a/.ci/manywheel/build_rocm.sh b/.ci/manywheel/build_rocm.sh index 703248d44aa9..331b3f369516 100755 --- a/.ci/manywheel/build_rocm.sh +++ b/.ci/manywheel/build_rocm.sh @@ -95,6 +95,10 @@ ROCM_SO_FILES=( "libroctracer64.so" "libroctx64.so" "libhipblaslt.so" +<<<<<<< HEAD +======= + "libhipsparselt.so" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "libhiprtc.so" ) @@ -186,6 +190,7 @@ do OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array done +<<<<<<< HEAD # rocBLAS library files ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library ROCBLAS_LIB_DST=lib/rocblas/library @@ -193,13 +198,36 @@ ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; seperated arch list ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES) +======= +ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; separated arch list to bar for grep + +# rocBLAS library files +ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library +ROCBLAS_LIB_DST=lib/rocblas/library +ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) +ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) +ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # hipblaslt library files HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library HIPBLASLT_LIB_DST=lib/hipblaslt/library +<<<<<<< HEAD ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH) OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx) HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES) +======= +HIPBLASLT_ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH) +HIPBLASLT_OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx) +HIPBLASLT_LIB_FILES=($HIPBLASLT_ARCH_SPECIFIC_FILES $HIPBLASLT_OTHER_FILES) + +# hipsparselt library files +HIPSPARSELT_LIB_SRC=$ROCM_HOME/lib/hipsparselt/library +HIPSPARSELT_LIB_DST=lib/hipsparselt/library 
+HIPSPARSELT_ARCH_SPECIFIC_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -E $ARCH) +#HIPSPARSELT_OTHER_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -v gfx) +HIPSPARSELT_LIB_FILES=($HIPSPARSELT_ARCH_SPECIFIC_FILES $HIPSPARSELT_OTHER_FILES) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # ROCm library files ROCM_SO_PATHS=() @@ -234,12 +262,20 @@ DEPS_SONAME=( DEPS_AUX_SRCLIST=( "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}" "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}" +<<<<<<< HEAD +======= + "${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_SRC/}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "/opt/amdgpu/share/libdrm/amdgpu.ids" ) DEPS_AUX_DSTLIST=( "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}" "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}" +<<<<<<< HEAD +======= + "${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_DST/}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "share/libdrm/amdgpu.ids" ) diff --git a/.ci/manywheel/build_xpu.sh b/.ci/manywheel/build_xpu.sh index 2bc60dd10727..b6016d45a96a 100755 --- a/.ci/manywheel/build_xpu.sh +++ b/.ci/manywheel/build_xpu.sh @@ -20,7 +20,15 @@ fi source /opt/intel/oneapi/compiler/latest/env/vars.sh source /opt/intel/oneapi/pti/latest/env/vars.sh source /opt/intel/oneapi/umf/latest/env/vars.sh +<<<<<<< HEAD export USE_STATIC_MKL=1 +======= +source /opt/intel/oneapi/ccl/latest/env/vars.sh +source /opt/intel/oneapi/mpi/latest/env/vars.sh +export USE_STATIC_MKL=1 +export USE_ONEMKL=1 +export USE_XCCL=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) WHEELHOUSE_DIR="wheelhousexpu" LIBTORCH_HOUSE_DIR="libtorch_housexpu" diff --git a/.ci/onnx/README.md b/.ci/onnx/README.md index 837e9b7d8109..bee250c271ef 100644 --- a/.ci/onnx/README.md +++ b/.ci/onnx/README.md @@ -10,5 +10,8 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are built on Jenkins and are used in triggered builds already have this environment variable set in their manifest. Also see `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. +<<<<<<< HEAD Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index dfc4e0fab927..58137d79acdf 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -27,6 +27,15 @@ cmake --version echo "Environment variables:" env +<<<<<<< HEAD +======= +# The sccache wrapped version of nvcc gets put in /opt/cache/lib in docker since +# there are some issues if it is always wrapped, so we need to add it to PATH +# during CI builds. 
+# https://github.com/pytorch/pytorch/blob/0b6c0898e6c352c8ea93daec854e704b41485375/.ci/docker/common/install_cache.sh#L97 +export PATH="/opt/cache/lib:$PATH" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then # Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289 export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 @@ -35,7 +44,11 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then fi if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then +======= + if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: there is a linking issue when building with UCC using clang, # disable it for now and to be fix later. # TODO: disable UCC temporarily to enable CUDA 12.1 in CI @@ -52,12 +65,15 @@ fi export USE_LLVM=/opt/llvm export LLVM_DIR=/opt/llvm/lib/cmake/llvm +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then # To build test_edge_op_registration export BUILD_EXECUTORCH=ON export USE_CUDA=0 fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ! which conda; then # In ROCm CIs, we are doing cross compilation on build machines with # intel cpu and later run tests on machines with amd cpu. @@ -171,6 +187,15 @@ fi if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then # shellcheck disable=SC1091 source /opt/intel/oneapi/compiler/latest/env/vars.sh +<<<<<<< HEAD +======= + # shellcheck disable=SC1091 + source /opt/intel/oneapi/ccl/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/mpi/latest/env/vars.sh + # Enable XCCL build + export USE_XCCL=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA export USE_KINETO=0 export TORCH_XPU_ARCH_LIST=pvc @@ -251,6 +276,10 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then set -e -o pipefail get_bazel +<<<<<<< HEAD +======= + python3 tools/optional_submodules.py checkout_eigen +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Leave 1 CPU free and use only up to 80% of memory to reduce the change of crashing # the runner @@ -277,10 +306,15 @@ else # or building non-XLA tests. if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* ]]; then +<<<<<<< HEAD if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then # Install numpy-2.0.2 for builds which are backward compatible with 1.X python -mpip install numpy==2.0.2 fi +======= + # Install numpy-2.0.2 for builds which are backward compatible with 1.X + python -mpip install numpy==2.0.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) WERROR=1 python setup.py clean @@ -303,6 +337,21 @@ else fi pip_install_whl "$(echo dist/*.whl)" +<<<<<<< HEAD +======= + if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then + echo "Checking that xpu is compiled" + pushd dist/ + if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then + echo "XPU support is compiled in." 
+ else + echo "XPU support is NOT compiled in." + exit 1 + fi + popd + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # TODO: I'm not sure why, but somehow we lose verbose commands set -x diff --git a/.ci/pytorch/check_binary.sh b/.ci/pytorch/check_binary.sh index f77d8a5b0777..74c7f301eff7 100755 --- a/.ci/pytorch/check_binary.sh +++ b/.ci/pytorch/check_binary.sh @@ -63,6 +63,7 @@ fi # Check GCC ABI ############################################################################### +<<<<<<< HEAD # NOTE [ Building libtorch with old vs. new gcc ABI ] # # Packages built with one version of ABI could not be linked against by client @@ -121,6 +122,14 @@ if [[ "$(uname)" != 'Darwin' ]]; then fi # We also check that there are [not] cxx11 symbols in libtorch +======= +# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce +# wheels with cxx11-abi + +echo "Checking that the gcc ABI is what we expect" +if [[ "$(uname)" != 'Darwin' ]]; then + # We also check that there are cxx11 symbols in libtorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # echo "Checking that symbols in libtorch.so have the right gcc abi" python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py" @@ -198,6 +207,7 @@ setup_link_flags () { TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code" build_and_run_example_cpp () { +<<<<<<< HEAD if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then GLIBCXX_USE_CXX11_ABI=1 else @@ -227,6 +237,13 @@ build_example_cpp_with_incorrect_abi () { fi } +======= + setup_link_flags + g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 + ./$1 +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ############################################################################### # Check simple Python/C++ calls ############################################################################### @@ -236,11 +253,14 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then export LD_LIBRARY_PATH=/usr/local/cuda/lib64 fi build_and_run_example_cpp simple-torch-test +<<<<<<< HEAD # `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test # the expected failure case for Ubuntu 16.04 + gcc 5.4 only. 
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then build_example_cpp_with_incorrect_abi simple-torch-test fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else pushd /tmp python -c 'import torch' @@ -298,6 +318,17 @@ else fi ############################################################################### +<<<<<<< HEAD +======= +# Check XPU configured correctly +############################################################################### +if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then + echo "Checking that xpu is compiled" + python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)' +fi + +############################################################################### +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Check CUDA configured correctly ############################################################################### # Skip these for Windows machines without GPUs @@ -375,10 +406,30 @@ except RuntimeError as e: fi ############################################################################### +<<<<<<< HEAD # Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries ############################################################################### if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then pushd /tmp python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))" +======= +# Check for C++ ABI compatibility to GCC-11 - GCC 13 +############################################################################### +if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then + pushd /tmp + # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html + # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19 + # gcc 11 - CUDA 11.8, xpu, rocm + # gcc 13 - CUDA 12.6, 12.8 and cpu + # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426 + if [[ "$(uname -m)" == "s390x" ]]; then + cxx_abi="19" + elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then + cxx_abi="18" + else + cxx_abi="16" + fi + python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd fi diff --git a/.ci/pytorch/common.sh b/.ci/pytorch/common.sh index e71f6d6eaf0b..e8ccdb3bb635 100644 --- a/.ci/pytorch/common.sh +++ b/.ci/pytorch/common.sh @@ -13,6 +13,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors unset HIP_PLATFORM export PYTORCH_TEST_WITH_ROCM=1 +<<<<<<< HEAD # temporary to locate some kernel issues on the CI nodes export HSAKMT_DEBUG_LEVEL=4 # improve rccl performance for distributed tests @@ -20,5 +21,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then fi # TODO: Renable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598 +======= +fi + +# TODO: Reenable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # shellcheck disable=SC2034 BUILD_TEST_LIBTORCH=0 diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh 
index 4f8439bd832d..fc74707ff80b 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -67,13 +67,21 @@ function pip_install_whl() { # Loop through each path and install individually for path in "${paths[@]}"; do echo "Installing $path" +<<<<<<< HEAD python3 -mpip install "$path" +======= + python3 -mpip install --no-index --no-deps "$path" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done else # Loop through each argument and install individually for path in "${args[@]}"; do echo "Installing $path" +<<<<<<< HEAD python3 -mpip install "$path" +======= + python3 -mpip install --no-index --no-deps "$path" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done fi } @@ -159,7 +167,10 @@ function install_torchvision() { fi } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) function install_torchrec_and_fbgemm() { local torchrec_commit torchrec_commit=$(get_pinned_commit torchrec) @@ -198,7 +209,11 @@ function install_torchrec_and_fbgemm() { function clone_pytorch_xla() { if [[ ! -d ./xla ]]; then +<<<<<<< HEAD git clone --recursive -b r2.7 https://github.com/pytorch/xla.git +======= + git clone --recursive -b r2.8 https://github.com/pytorch/xla.git +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd xla # pin the xla hash so that we don't get broken by changes to xla git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index 4a2f63a2ed10..524d04e84dbe 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -33,6 +33,7 @@ if which sccache > /dev/null; then export PATH="${tmp_dir}:$PATH" fi +<<<<<<< HEAD cross_compile_arm64() { # Cross compilation for arm64 # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests @@ -83,6 +84,17 @@ else compile_x86_64 fi +======= +print_cmake_info +if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then + # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls + USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel +else + # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests + # that building with USE_DISTRIBUTED=0 works at all. 
See https://github.com/pytorch/pytorch/issues/86448 + USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64 +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if which sccache > /dev/null; then print_sccache_stats fi diff --git a/.ci/pytorch/macos-common.sh b/.ci/pytorch/macos-common.sh index 1c7bc103673d..05feb61bc8d9 100755 --- a/.ci/pytorch/macos-common.sh +++ b/.ci/pytorch/macos-common.sh @@ -20,6 +20,7 @@ print_cmake_info() { CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC") # Print all libraries under cmake rpath for debugging ls -la "$CONDA_INSTALLATION_DIR/../lib" +<<<<<<< HEAD export CMAKE_EXEC # Explicitly add conda env lib folder to cmake rpath to address the flaky issue @@ -30,4 +31,6 @@ print_cmake_info() { # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid)) # with an exit code 137 otherwise codesign -f -s - "${CMAKE_EXEC}" || true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 179556cc59d0..709074a1d47c 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -5,11 +5,14 @@ set -x # shellcheck source=./macos-common.sh source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh" +<<<<<<< HEAD if [[ -n "$CONDA_ENV" ]]; then # Use binaries under conda environment export PATH="$CONDA_ENV/bin":$PATH fi +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Test that OpenMP is enabled pushd test if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then @@ -42,6 +45,19 @@ test_python_all() { assert_git_not_dirty } +<<<<<<< HEAD +======= +test_python_mps() { + setup_test_python + + time python test/run_test.py --verbose --mps + MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture + + assert_git_not_dirty +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_python_shard() { if [[ -z "$NUM_TEST_SHARDS" ]]; then echo "NUM_TEST_SHARDS must be defined to run a Python test shard" @@ -155,6 +171,10 @@ test_jit_hooks() { torchbench_setup_macos() { git clone --recursive https://github.com/pytorch/vision torchvision git clone --recursive https://github.com/pytorch/audio torchaudio +<<<<<<< HEAD +======= + brew install jpeg-turbo libpng +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pushd torchvision git fetch @@ -169,7 +189,12 @@ torchbench_setup_macos() { git checkout "$(cat ../.github/ci_commit_pins/audio.txt)" git submodule update --init --recursive python setup.py clean +<<<<<<< HEAD python setup.py develop +======= + #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp + USE_OPENMP=0 python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd # Shellcheck doesn't like it when you pass no arguments to a function that can take args. 
See https://www.shellcheck.net/wiki/SC2120 @@ -177,9 +202,14 @@ torchbench_setup_macos() { checkout_install_torchbench } +<<<<<<< HEAD conda_benchmark_deps() { conda install -y astunparse numpy scipy ninja pyyaml setuptools cmake typing-extensions requests protobuf numba cython scikit-learn conda install -y -c conda-forge librosa +======= +pip_benchmark_deps() { + python -mpip install --no-input astunparse requests cython scikit-learn +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } @@ -187,7 +217,11 @@ test_torchbench_perf() { print_cmake_info echo "Launching torchbench setup" +<<<<<<< HEAD conda_benchmark_deps +======= + pip_benchmark_deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torchbench_setup_macos TEST_REPORTS_DIR=$(pwd)/test/test-reports @@ -214,13 +248,18 @@ test_torchbench_smoketest() { print_cmake_info echo "Launching torchbench setup" +<<<<<<< HEAD conda_benchmark_deps +======= + pip_benchmark_deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # shellcheck disable=SC2119,SC2120 torchbench_setup_macos TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" +<<<<<<< HEAD local backend=eager local dtype=notset local device=mps @@ -240,6 +279,56 @@ test_torchbench_smoketest() { PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ --performance --only "$model" --backend "$backend" --inference --devices "$device" \ --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" +======= + local device=mps + local dtypes=(undefined float16 bfloat16 notset) + local dtype=${dtypes[$1]} + local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + + for backend in eager inductor; do + + echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}" + local dtype_arg="--${dtype}" + if [ "$dtype" == notset ]; then + dtype_arg="--float32" + fi + touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" + for model in "${models[@]}"; do + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true + if [ "$backend" == "inductor" ]; then + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_accuracy.csv" || true + fi + done + if [ "$backend" == "inductor" ]; then + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --performance --backend "$backend" --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --accuracy --backend "$backend" --inference --devices "$device" "$dtype_arg" \ + --output 
"$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_accuracy.csv" || true + fi + + if [ "$dtype" == notset ]; then + for dtype_ in notset amp; do + echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype_}" + touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv" + local dtype_arg="--${dtype_}" + if [ "$dtype_" == notset ]; then + dtype_arg="--float32" + fi + for model in "${models[@]}"; do + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv" || true + done + done + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) done echo "Pytorch benchmark on mps device completed" @@ -249,7 +338,11 @@ test_hf_perf() { print_cmake_info TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" +<<<<<<< HEAD conda_benchmark_deps +======= + pip_benchmark_deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torchbench_setup_macos echo "Launching HuggingFace training perf run" @@ -265,7 +358,11 @@ test_timm_perf() { print_cmake_info TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" +<<<<<<< HEAD conda_benchmark_deps +======= + pip_benchmark_deps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torchbench_setup_macos echo "Launching timm training perf run" @@ -288,7 +385,13 @@ elif [[ $TEST_CONFIG == *"perf_hf"* ]]; then elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then test_timm_perf elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then +<<<<<<< HEAD test_torchbench_smoketest +======= + test_torchbench_smoketest "${SHARD_NUMBER}" +elif [[ $TEST_CONFIG == *"mps"* ]]; then + test_python_mps +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then test_python_shard "${SHARD_NUMBER}" if [[ "${SHARD_NUMBER}" == 1 ]]; then diff --git a/.ci/pytorch/python_doc_push_script.sh b/.ci/pytorch/python_doc_push_script.sh index 229a4a5b5297..6c8f983b98d3 100755 --- a/.ci/pytorch/python_doc_push_script.sh +++ b/.ci/pytorch/python_doc_push_script.sh @@ -119,12 +119,15 @@ popd git rm -rf "$install_path" || true mv "$pt_checkout/docs/build/html" "$install_path" +<<<<<<< HEAD # Prevent Google from indexing $install_path/_modules. This folder contains # generated source files. # NB: the following only works on gnu sed. The sed shipped with mac os is different. # One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed". 
find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '//a \ \ ' +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) git add "$install_path" || true git status git config user.email "soumith+bot@pytorch.org" diff --git a/.ci/pytorch/run_tests.sh b/.ci/pytorch/run_tests.sh index 6c1c55468864..87714d654de4 100755 --- a/.ci/pytorch/run_tests.sh +++ b/.ci/pytorch/run_tests.sh @@ -76,7 +76,11 @@ fi # Environment initialization if [[ "$(uname)" == Darwin ]]; then # Install the testing dependencies +<<<<<<< HEAD retry conda install -yq future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml +======= + retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else retry pip install -qr requirements.txt || true retry pip install -q hypothesis protobuf pytest setuptools || true @@ -91,7 +95,10 @@ fi echo "Testing with:" pip freeze +<<<<<<< HEAD conda list || true +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ############################################################################## # Smoke tests diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py index 97d6482d63bc..2663ccaab062 100755 --- a/.ci/pytorch/smoke_test/check_binary_symbols.py +++ b/.ci/pytorch/smoke_test/check_binary_symbols.py @@ -80,7 +80,11 @@ def _get_symbols_chunk(i): return functools.reduce(list.__add__, (x.result() for x in tasks), []) +<<<<<<< HEAD def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) -> None: +======= +def check_lib_symbols_for_abi_correctness(lib: str) -> None: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) pre_cxx11_symbols = grep_symbols(lib, LIBTORCH_PRE_CXX11_PATTERNS) @@ -88,6 +92,7 @@ def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) num_pre_cxx11_symbols = len(pre_cxx11_symbols) print(f"num_cxx11_symbols: {num_cxx11_symbols}") print(f"num_pre_cxx11_symbols: {num_pre_cxx11_symbols}") +<<<<<<< HEAD if pre_cxx11_abi: if num_cxx11_symbols > 0: raise RuntimeError( @@ -110,6 +115,14 @@ def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) ) if num_cxx11_symbols < 100: raise RuntimeError("Didn't find enought cxx11 symbols") +======= + if num_pre_cxx11_symbols > 0: + raise RuntimeError( + f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}" + ) + if num_cxx11_symbols < 100: + raise RuntimeError("Didn't find enough cxx11 symbols") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) def main() -> None: @@ -121,9 +134,14 @@ def main() -> None: else: install_root = Path(distutils.sysconfig.get_python_lib()) / "torch" +<<<<<<< HEAD libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so" pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "") check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi) +======= + libtorch_cpu_path = str(install_root / "lib" / 
"libtorch_cpu.so") + check_lib_symbols_for_abi_correctness(libtorch_cpu_path) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if __name__ == "__main__": diff --git a/.ci/pytorch/smoke_test/check_gomp.py b/.ci/pytorch/smoke_test/check_gomp.py index 93430ff39906..a47b76201bbe 100644 --- a/.ci/pytorch/smoke_test/check_gomp.py +++ b/.ci/pytorch/smoke_test/check_gomp.py @@ -46,6 +46,12 @@ def get_gomp_thread(): # use the default gomp path of AlmaLinux OS libgomp_path = "/usr/lib64/libgomp.so.1" +<<<<<<< HEAD +======= + # if it does not exist, try Ubuntu path + if not os.path.exists(libgomp_path): + libgomp_path = f"/usr/lib/{os.uname().machine}-linux-gnu/libgomp.so.1" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) os.environ["GOMP_CPU_AFFINITY"] = "0-3" diff --git a/.ci/pytorch/smoke_test/smoke_test.py b/.ci/pytorch/smoke_test/smoke_test.py index 6f5531178319..43c2a3fb753f 100644 --- a/.ci/pytorch/smoke_test/smoke_test.py +++ b/.ci/pytorch/smoke_test/smoke_test.py @@ -7,6 +7,10 @@ import sys from pathlib import Path from tempfile import NamedTemporaryFile +<<<<<<< HEAD +======= +from typing import Optional +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) import torch import torch._dynamo @@ -195,8 +199,46 @@ def test_cuda_gds_errors_captured() -> None: ) +<<<<<<< HEAD def smoke_test_cuda( package: str, runtime_error_check: str, torch_compile_check: str +======= +def find_pypi_package_version(package: str) -> Optional[str]: + from importlib import metadata + + dists = metadata.distributions() + for dist in dists: + if dist.metadata["Name"].startswith(package): + return dist.version + return None + + +def cudnn_to_version_str(cudnn_version: int) -> str: + patch = int(cudnn_version % 10) + minor = int((cudnn_version / 100) % 100) + major = int((cudnn_version / 10000) % 10000) + return f"{major}.{minor}.{patch}" + + +def compare_pypi_to_torch_versions( + package: str, pypi_version: str, torch_version: str +) -> None: + if pypi_version is None: + raise RuntimeError(f"Can't find {package} in PyPI for Torch: {torch_version}") + if pypi_version.startswith(torch_version): + print(f"Found matching {package}. Torch: {torch_version} PyPI {pypi_version}") + else: + raise RuntimeError( + f"Wrong {package} version. Torch: {torch_version} PyPI: {pypi_version}" + ) + + +def smoke_test_cuda( + package: str, + runtime_error_check: str, + torch_compile_check: str, + pypi_pkg_check: str, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) -> None: if not torch.cuda.is_available() and is_cuda_system: raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.") @@ -226,20 +268,44 @@ def smoke_test_cuda( raise RuntimeError( f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}" ) +<<<<<<< HEAD print(f"torch cuda: {torch.version.cuda}") # todo add cudnn version validation print(f"torch cudnn: {torch.backends.cudnn.version()}") print(f"cuDNN enabled? 
{torch.backends.cudnn.enabled}") +======= + + print(f"torch cuda: {torch.version.cuda}") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) torch.cuda.init() print("CUDA initialized successfully") print(f"Number of CUDA devices: {torch.cuda.device_count()}") for i in range(torch.cuda.device_count()): print(f"Device {i}: {torch.cuda.get_device_name(i)}") +<<<<<<< HEAD # nccl is availbale only on Linux if sys.platform in ["linux", "linux2"]: print(f"torch nccl version: {torch.cuda.nccl.version()}") +======= + print(f"cuDNN enabled? {torch.backends.cudnn.enabled}") + torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version()) + print(f"Torch cuDNN version: {torch_cudnn_version}") + + if sys.platform in ["linux", "linux2"]: + torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version()) + print(f"Torch nccl; version: {torch_nccl_version}") + + # Pypi dependencies are installed on linux only and nccl is available only on Linux. + if pypi_pkg_check == "enabled" and sys.platform in ["linux", "linux2"]: + compare_pypi_to_torch_versions( + "cudnn", find_pypi_package_version("nvidia-cudnn"), torch_cudnn_version + ) + compare_pypi_to_torch_versions( + "nccl", find_pypi_package_version("nvidia-nccl"), torch_nccl_version + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if runtime_error_check == "enabled": test_cuda_runtime_errors_captured() @@ -398,6 +464,16 @@ def parse_args(): choices=["enabled", "disabled"], default="enabled", ) +<<<<<<< HEAD +======= + parser.add_argument( + "--pypi-pkg-check", + help="Check pypi package versions cudnn and nccl", + type=str, + choices=["enabled", "disabled"], + default="enabled", + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return parser.parse_args() @@ -422,7 +498,14 @@ def main() -> None: smoke_test_modules() smoke_test_cuda( +<<<<<<< HEAD options.package, options.runtime_error_check, options.torch_compile_check +======= + options.package, + options.runtime_error_check, + options.torch_compile_check, + options.pypi_pkg_check, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 369148833ea9..c4ca78cc6ae4 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -191,8 +191,17 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then # shellcheck disable=SC1091 source /opt/intel/oneapi/umf/latest/env/vars.sh fi +<<<<<<< HEAD # Check XPU status before testing xpu-smi discovery +======= + # shellcheck disable=SC1091 + source /opt/intel/oneapi/ccl/latest/env/vars.sh + # shellcheck disable=SC1091 + source /opt/intel/oneapi/mpi/latest/env/vars.sh + # Check XPU status before testing + timeout 30 xpu-smi discovery || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then @@ -220,7 +229,11 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then export PYTORCH_TEST_WITH_ASAN=1 export PYTORCH_TEST_WITH_UBSAN=1 # TODO: Figure out how to avoid hard-coding these paths +<<<<<<< HEAD export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-15/bin/llvm-symbolizer +======= + export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-18/bin/llvm-symbolizer +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export TORCH_USE_RTLD_GLOBAL=1 # NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our # default behavior. @@ -312,6 +325,26 @@ test_python() { assert_git_not_dirty } +<<<<<<< HEAD +======= +test_python_smoke() { + # Smoke tests for H100 + time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + +test_h100_distributed() { + # Distributed tests at H100 + time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # This test requires multicast support + time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # symmetric memory test + time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_lazy_tensor_meta_reference_disabled() { export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1 echo "Testing lazy tensor operations without meta reference" @@ -340,6 +373,20 @@ test_dynamo_wrapped_shard() { assert_git_not_dirty } +<<<<<<< HEAD +======= +test_einops() { + pip install einops==0.6.1 + time python test/run_test.py --einops --verbose --upload-artifacts-while-running + pip install einops==0.7.0 + time python test/run_test.py --einops --verbose --upload-artifacts-while-running + pip install einops==0.8.1 + time python test/run_test.py --einops --verbose --upload-artifacts-while-running + assert_git_not_dirty +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test_inductor_distributed() { # Smuggle a few multi-gpu tests here so that we don't have to request another large node echo "Testing multi_gpu tests in test_torchinductor" @@ -396,8 +443,20 @@ test_inductor_aoti() { # We need to hipify before building again python3 tools/amd_build/build_amd.py fi +<<<<<<< HEAD BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference +======= + if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then + BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop + # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB + LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH + CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile + else + BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop + CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) } test_inductor_cpp_wrapper_shard() { @@ -412,10 +471,18 @@ test_inductor_cpp_wrapper_shard() { if [[ "$1" -eq "2" ]]; then # For now, manually put the opinfo tests in shard 2, and all other tests in +<<<<<<< HEAD # shard 1. Test specific things triggering past bugs, for now. python test/run_test.py \ --include inductor/test_torchinductor_opinfo \ -k 'linalg or to_sparse' \ +======= + # shard 1. Run all CPU tests, as well as specific GPU tests triggering past + # bugs, for now. + python test/run_test.py \ + --include inductor/test_torchinductor_opinfo \ + -k 'linalg or to_sparse or TestInductorOpInfoCPU' \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --verbose exit fi @@ -570,7 +637,13 @@ test_perf_for_dashboard() { local device=cuda if [[ "${TEST_CONFIG}" == *cpu* ]]; then +<<<<<<< HEAD if [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then +======= + if [[ "${TEST_CONFIG}" == *zen_cpu_x86* ]]; then + device=zen_cpu_x86 + elif [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) device=cpu_x86 elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then device=cpu_aarch64 @@ -800,6 +873,7 @@ test_inductor_torchbench_smoketest_perf() { done } +<<<<<<< HEAD test_inductor_get_core_number() { if [[ "${TEST_CONFIG}" == *aarch64* ]]; then echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))" @@ -810,6 +884,9 @@ test_inductor_get_core_number() { test_inductor_set_cpu_affinity(){ #set jemalloc +======= +test_inductor_set_cpu_affinity(){ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)" export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD" export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" @@ -821,14 +898,33 @@ test_inductor_set_cpu_affinity(){ export KMP_AFFINITY=granularity=fine,compact,1,0 export KMP_BLOCKTIME=1 fi +<<<<<<< HEAD cores=$(test_inductor_get_core_number) # Set number of cores to 16 on Aarch64 for performance runs. 
+======= + + # Use nproc here instead of lscpu because it takes into account cgroups slice + cpus=$(nproc) + thread_per_core=$(lscpu | grep 'Thread(s) per core:' | awk '{print $4}') + cores=$((cpus / thread_per_core)) + + # Set number of cores to 16 on aarch64 for performance runs +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then cores=16 fi export OMP_NUM_THREADS=$cores +<<<<<<< HEAD end_core=$((cores-1)) export TASKSET="taskset -c 0-$end_core" +======= + + # Handle cgroups slice start and end CPU + start_cpu=$(python -c 'import os; print(min(os.sched_getaffinity(0)))') + # Leaving one physical CPU for other tasks + end_cpu=$(($(python -c 'import os; print(max(os.sched_getaffinity(0)))') - thread_per_core)) + export TASKSET="taskset -c $start_cpu-$end_cpu" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } test_inductor_torchbench_cpu_smoketest_perf(){ @@ -1111,6 +1207,15 @@ test_custom_backend() { test_custom_script_ops() { echo "Testing custom script operators" +<<<<<<< HEAD +======= + + if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then + echo "Skipping custom script operators until it's fixed" + return 0 + fi + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build" pushd test/custom_operator cp -a "$CUSTOM_OP_BUILD" build @@ -1173,7 +1278,10 @@ build_xla() { # These functions are defined in .circleci/common.sh in pytorch/xla repo retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR +<<<<<<< HEAD retry install_post_deps_pytorch_xla +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert_git_not_dirty } @@ -1475,8 +1583,11 @@ test_executorch() { export PYTHON_EXECUTABLE=python export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" +<<<<<<< HEAD # For llama3 bash examples/models/llama3_2_vision/install_requirements.sh +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch # from the PR bash .ci/scripts/setup-linux.sh --build-tool cmake @@ -1503,7 +1614,11 @@ test_executorch() { test_linux_aarch64() { python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ +<<<<<<< HEAD test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ +======= + test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops test_cpp_extensions_open_device_registration \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose # Dynamo tests @@ -1519,12 +1634,41 @@ test_linux_aarch64() { inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \ inductor/test_max_autotune 
inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \ inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \ +<<<<<<< HEAD inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \ +======= + inductor/test_split_cat_fx_passes inductor/test_compile inductor/test_torchinductor \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \ inductor/test_triton_cpu_backend inductor/test_triton_extension_backend inductor/test_mkldnn_pattern_matcher inductor/test_cpu_cpp_wrapper \ --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose } +<<<<<<< HEAD +======= +test_operator_benchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + test_inductor_set_cpu_affinity + + cd benchmarks/operator_benchmark/pt_extension + python setup.py install + + cd "${TEST_DIR}"/benchmarks/operator_benchmark + $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \ + --output-csv "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.json" \ + + pip_install pandas + python check_perf_csv.py \ + --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ + --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" +} + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") @@ -1555,6 +1699,22 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then if [[ "${SHARD_NUMBER}" == 1 ]]; then test_rpc fi +<<<<<<< HEAD +======= +elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then + TEST_MODE="short" + + if [[ "${TEST_CONFIG}" == *cpu* ]]; then + if [[ "${TEST_CONFIG}" == *long* ]]; then + TEST_MODE="long" + elif [[ "${TEST_CONFIG}" == *all* ]]; then + TEST_MODE="all" + fi + + test_operator_benchmark cpu ${TEST_MODE} + + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then @@ -1588,7 +1748,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then install_torchaudio cuda fi install_torchvision +<<<<<<< HEAD TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git +======= + TORCH_CUDA_ARCH_LIST="8.0;8.6" install_torchao +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id=$((SHARD_NUMBER-1)) # https://github.com/opencv/opencv-python/issues/885 pip_install opencv-python==4.8.0.74 @@ -1617,6 +1781,10 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then install_torchvision checkout_install_torchbench hf_T5 llama moco PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER" +<<<<<<< HEAD +======= + test_inductor_aoti +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *inductor* ]]; then install_torchvision test_inductor_shard "${SHARD_NUMBER}" @@ -1625,6 +1793,11 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then test_inductor_distributed fi fi +<<<<<<< HEAD +======= +elif [[ "${TEST_CONFIG}" == *einops* ]]; then + test_einops +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then install_torchvision test_dynamo_wrapped_shard "${SHARD_NUMBER}" @@ -1670,6 +1843,13 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then test_python test_aten test_xpu_bin +<<<<<<< HEAD +======= +elif [[ "${TEST_CONFIG}" == smoke ]]; then + test_python_smoke +elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then + test_h100_distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else install_torchvision install_monkeytype diff --git a/.ci/pytorch/test_example_code/CMakeLists.txt b/.ci/pytorch/test_example_code/CMakeLists.txt index e87f37ae61fb..688395d1615d 100644 --- a/.ci/pytorch/test_example_code/CMakeLists.txt +++ b/.ci/pytorch/test_example_code/CMakeLists.txt @@ -16,7 +16,11 @@ target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse find_library(CUDNN_LIBRARY NAMES cudnn) target_link_libraries(simple-torch-test ${CUDNN_LIBRARY} ) if(MSVC) +<<<<<<< HEAD file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll") +======= + file(GLOB TORCH_DLLS "$ENV{CUDA_PATH}/bin/cudnn64_8.dll") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) message("dlls to copy " ${TORCH_DLLS}) add_custom_command(TARGET simple-torch-test POST_BUILD diff --git a/.ci/pytorch/win-build.sh b/.ci/pytorch/win-build.sh index 7966e56695c2..44c8874e3a2d 100755 --- a/.ci/pytorch/win-build.sh +++ b/.ci/pytorch/win-build.sh @@ -31,7 +31,11 @@ PYLONG_API_CHECK=$? if [[ $PYLONG_API_CHECK == 0 ]]; then echo "Usage of PyLong_{From,As}{Unsigned}Long API may lead to overflow errors on Windows" echo "because \`sizeof(long) == 4\` and \`sizeof(unsigned long) == 4\`." +<<<<<<< HEAD echo "Please include \"torch/csrc/utils/python_numbers.h\" and use the correspoding APIs instead." +======= + echo "Please include \"torch/csrc/utils/python_numbers.h\" and use the corresponding APIs instead." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "PyLong_FromLong -> THPUtils_packInt32 / THPUtils_packInt64" echo "PyLong_AsLong -> THPUtils_unpackInt (32-bit) / THPUtils_unpackLong (64-bit)" echo "PyLong_FromUnsignedLong -> THPUtils_packUInt32 / THPUtils_packUInt64" diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 297c0a689b24..9cdb0bf0cbf9 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -10,7 +10,11 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol :: able to see what our cl.exe commands are (since you can actually :: just copy-paste them into a local Windows setup to just rebuild a :: single file.) 
+<<<<<<< HEAD :: log sizes are too long, but leaving this here incase someone wants to use it locally +======= +:: log sizes are too long, but leaving this here in case someone wants to use it locally +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: set CMAKE_VERBOSE_MAKEFILE=1 @@ -37,6 +41,14 @@ call %INSTALLER_DIR%\activate_miniconda3.bat if errorlevel 1 goto fail if not errorlevel 0 goto fail +<<<<<<< HEAD +======= +:: Update CMake +call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9 +if errorlevel 1 goto fail +if not errorlevel 0 goto fail + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0 if errorlevel 1 goto fail if not errorlevel 0 goto fail @@ -88,7 +100,11 @@ set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% :cuda_build_end set DISTUTILS_USE_SDK=1 +<<<<<<< HEAD set PATH=%TMP_DIR_WIN%\bin;%PATH% +======= +set PATH=%TMP_DIR_WIN%\bin;C:\Program Files\CMake\bin;%PATH% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: The latest Windows CUDA test is running on AWS G5 runner with A10G GPU if "%TORCH_CUDA_ARCH_LIST%" == "" set TORCH_CUDA_ARCH_LIST=8.6 diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat index d0fbf5b20d88..9ac40c5c23bb 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat @@ -24,7 +24,11 @@ if "%CUDA_SUFFIX%" == "" ( if "%REBUILD%"=="" ( if "%BUILD_ENVIRONMENT%"=="" ( +<<<<<<< HEAD curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z +======= + curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) else ( aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet ) diff --git a/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py b/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py index 6df547d4a3eb..7f3f252c6eb9 100755 --- a/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py +++ b/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py @@ -52,7 +52,11 @@ if os.path.exists(debugger): command_args = [debugger, "-o", "-c", "~*g; q"] + command_args command_string = " ".join(command_args) +<<<<<<< HEAD print("Reruning with traceback enabled") +======= + print("Rerunning with traceback enabled") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) print("Command:", command_string) subprocess.run(command_args, check=False) sys.exit(e.returncode) diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index 0426982a3ad9..7437435d55c3 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -38,7 +38,11 @@ if [[ 
"$BUILD_ENVIRONMENT" == *cuda* ]]; then fi # TODO: Move both of them to Windows AMI +<<<<<<< HEAD python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 pytest-subtests==0.13.1 +======= +python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install Z3 optional dependency for Windows builds. python -m pip install z3-solver==4.12.2.0 diff --git a/.ci/pytorch/windows/arm64/bootstrap_libuv.bat b/.ci/pytorch/windows/arm64/bootstrap_libuv.bat index 33272f3ef09d..9447512ee2eb 100644 --- a/.ci/pytorch/windows/arm64/bootstrap_libuv.bat +++ b/.ci/pytorch/windows/arm64/bootstrap_libuv.bat @@ -7,7 +7,11 @@ if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR% if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR% :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe cd %DEPENDENCIES_DIR% diff --git a/.ci/pytorch/windows/arm64/bootstrap_openblas.bat b/.ci/pytorch/windows/arm64/bootstrap_openblas.bat index 463e765ede12..6e87228c4542 100644 --- a/.ci/pytorch/windows/arm64/bootstrap_openblas.bat +++ b/.ci/pytorch/windows/arm64/bootstrap_openblas.bat @@ -7,7 +7,11 @@ if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR% if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR% :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe :: Clone OpenBLAS diff --git a/.ci/pytorch/windows/arm64/bootstrap_tests.bat b/.ci/pytorch/windows/arm64/bootstrap_tests.bat index c0fc48702604..debac12d9740 100644 --- a/.ci/pytorch/windows/arm64/bootstrap_tests.bat +++ b/.ci/pytorch/windows/arm64/bootstrap_tests.bat @@ -2,7 +2,11 @@ cd %PYTORCH_ROOT% :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe :: create virtual environment diff --git a/.ci/pytorch/windows/arm64/build_libtorch.bat b/.ci/pytorch/windows/arm64/build_libtorch.bat index 139e0b47be58..c2f251b5ddce 100644 --- a/.ci/pytorch/windows/arm64/build_libtorch.bat +++ b/.ci/pytorch/windows/arm64/build_libtorch.bat @@ -21,7 +21,11 @@ if %ENABLE_APL% == 1 ( ) :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe :: change to source directory diff --git 
a/.ci/pytorch/windows/arm64/build_pytorch.bat b/.ci/pytorch/windows/arm64/build_pytorch.bat index b4d67b48e4fc..6f5133c32ddf 100644 --- a/.ci/pytorch/windows/arm64/build_pytorch.bat +++ b/.ci/pytorch/windows/arm64/build_pytorch.bat @@ -21,7 +21,11 @@ if %ENABLE_APL% == 1 ( ) :: activate visual studio +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) where cl.exe :: change to source directory diff --git a/.ci/pytorch/windows/arm64/smoke_test.bat b/.ci/pytorch/windows/arm64/smoke_test.bat index 378413cffc85..db410f4b5139 100644 --- a/.ci/pytorch/windows/arm64/smoke_test.bat +++ b/.ci/pytorch/windows/arm64/smoke_test.bat @@ -33,7 +33,11 @@ pushd tmp set VC_VERSION_LOWER=14 set VC_VERSION_UPPER=36 +<<<<<<< HEAD call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64 +======= +call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set install_root=%CD% set INCLUDE=%INCLUDE%;%install_root%\include;%install_root%\include\torch\csrc\api\include diff --git a/.ci/pytorch/windows/build_pytorch.bat b/.ci/pytorch/windows/build_pytorch.bat index 2a1b73a527d1..de69b6ad5d29 100644 --- a/.ci/pytorch/windows/build_pytorch.bat +++ b/.ci/pytorch/windows/build_pytorch.bat @@ -1,7 +1,12 @@ @echo off +<<<<<<< HEAD :: This script parses args, installs required libraries (miniconda, MKL, :: Magma), and then delegates to cpu.bat, cuda80.bat, etc. +======= +:: This script parses args, installs required libraries (MKL, Magma, libuv) +:: and then delegates to cpu.bat, cuda80.bat, etc. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not "%CUDA_VERSION%" == "" if not "%PYTORCH_BUILD_VERSION%" == "" if not "%PYTORCH_BUILD_NUMBER%" == "" goto env_end if "%~1"=="" goto arg_error @@ -36,6 +41,7 @@ set DESIRED_PYTHON_PREFIX=py%DESIRED_PYTHON_PREFIX:;=;py% set SRC_DIR=%~dp0 pushd %SRC_DIR% +<<<<<<< HEAD :: Install Miniconda3 set "CONDA_HOME=%CD%\conda" set "tmp_conda=%CONDA_HOME%" @@ -49,15 +55,28 @@ set "ORIG_PATH=%PATH%" set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" :: create a new conda environment and install packages +======= +set "ORIG_PATH=%PATH%" + +:: setup build environment +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :try SET /A tries=3 :loop IF %tries% LEQ 0 GOTO :exception +<<<<<<< HEAD call condaenv.bat IF %ERRORLEVEL% EQU 0 GOTO :done SET /A "tries=%tries%-1" :exception echo "Failed to create conda env" +======= +call setup_build.bat +IF %ERRORLEVEL% EQU 0 GOTO :done +SET /A "tries=%tries%-1" +:exception +echo "Failed to setup build environment" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exit /B 1 :done @@ -73,7 +92,11 @@ if "%DEBUG%" == "1" ( if not "%CUDA_VERSION%" == "cpu" if not "%CUDA_VERSION%" == "xpu" ( rmdir /s /q magma_%CUDA_PREFIX%_%BUILD_TYPE% del magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z +<<<<<<< HEAD curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z +======= + curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z %= @lint-ignore =% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 7z x -aoa magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z -omagma_%CUDA_PREFIX%_%BUILD_TYPE% ) @@ -107,6 +130,7 @@ set TH_BINARY_BUILD=1 set INSTALL_TEST=0 for %%v in (%DESIRED_PYTHON_PREFIX%) do ( +<<<<<<< HEAD :: Activate Python Environment set PYTHON_PREFIX=%%v set "CONDA_LIB_PATH=%CONDA_HOME%\envs\%%v\Library\bin" @@ -115,11 +139,26 @@ for %%v in (%DESIRED_PYTHON_PREFIX%) do ( ) else ( set "PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%" ) +======= + + :: Set Environment vars for the build + set "CMAKE_PREFIX_PATH=%CD%\Python\Library\;%PATH%" + set "PYTHON_LIB_PATH=%CD%\Python\Library\bin" + + if not "%ADDITIONAL_PATH%" == "" ( + set "PATH=%ADDITIONAL_PATH%;%PATH%" + ) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pip install ninja @setlocal :: Set Flags if not "%CUDA_VERSION%"=="cpu" if not "%CUDA_VERSION%" == "xpu" ( +<<<<<<< HEAD set MAGMA_HOME=%cd%\magma_%CUDA_PREFIX%_%BUILD_TYPE% +======= + set "MAGMA_HOME=%cd%\magma_%CUDA_PREFIX%_%BUILD_TYPE%" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) echo "Calling arch build script" call %CUDA_PREFIX%.bat diff --git a/.ci/pytorch/windows/cuda126.bat b/.ci/pytorch/windows/cuda126.bat index efb8cfec63e7..2db616810ecb 100644 --- a/.ci/pytorch/windows/cuda126.bat +++ b/.ci/pytorch/windows/cuda126.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 
+<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V126%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" ( set "CUDA_PATH_V126=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6" @@ -37,7 +40,11 @@ IF "%CUDA_PATH_V126%"=="" ( ) IF "%BUILD_VISION%" == "" ( +<<<<<<< HEAD set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0 +======= + set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat index f660f1d0a699..1657918b73a3 100644 --- a/.ci/pytorch/windows/cuda128.bat +++ b/.ci/pytorch/windows/cuda128.bat @@ -18,6 +18,7 @@ REM Check for optional components set USE_CUDA= set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +<<<<<<< HEAD IF "%NVTOOLSEXT_PATH%"=="" ( IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" ( set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt @@ -27,6 +28,8 @@ IF "%NVTOOLSEXT_PATH%"=="" ( ) ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF "%CUDA_PATH_V128%"=="" ( IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc.exe" ( set "CUDA_PATH_V128=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8" @@ -37,7 +40,11 @@ IF "%CUDA_PATH_V128%"=="" ( ) IF "%BUILD_VISION%" == "" ( +<<<<<<< HEAD set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 +======= + set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 diff --git a/.ci/pytorch/windows/cuda129.bat b/.ci/pytorch/windows/cuda129.bat new file mode 100644 index 000000000000..9ef36342f269 --- /dev/null +++ b/.ci/pytorch/windows/cuda129.bat @@ -0,0 +1,50 @@ +@echo off + +set MODULE_NAME=pytorch + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd %~dp0 +) ELSE ( + call internal\clean.bat +) +IF ERRORLEVEL 1 goto :eof + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto :eof + +REM Check for optional components + +set USE_CUDA= +set CMAKE_GENERATOR=Visual 
Studio 15 2017 Win64 + +IF "%CUDA_PATH_V129%"=="" ( + IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\nvcc.exe" ( + set "CUDA_PATH_V129=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" + ) ELSE ( + echo CUDA 12.9 not found, failing + exit /b 1 + ) +) + +IF "%BUILD_VISION%" == "" ( + set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all +) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 +) + +set "CUDA_PATH=%CUDA_PATH_V129%" +set "PATH=%CUDA_PATH_V129%\bin;%PATH%" + +:optcheck + +call internal\check_opts.bat +IF ERRORLEVEL 1 goto :eof + +if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\.. +call %~dp0\internal\copy.bat +IF ERRORLEVEL 1 goto :eof + +call %~dp0\internal\setup.bat +IF ERRORLEVEL 1 goto :eof diff --git a/.ci/pytorch/windows/internal/7z_install.bat b/.ci/pytorch/windows/internal/7z_install.bat index d5a1156360d9..50522e60f9ff 100644 --- a/.ci/pytorch/windows/internal/7z_install.bat +++ b/.ci/pytorch/windows/internal/7z_install.bat @@ -1,6 +1,10 @@ @echo off +<<<<<<< HEAD curl -k https://www.7-zip.org/a/7z1805-x64.exe -O +======= +curl -k -L "https://sourceforge.net/projects/sevenzip/files/7-Zip/18.05/7z1805-x64.exe/download" -o 7z1805-x64.exe +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 start /wait 7z1805-x64.exe /S diff --git a/.ci/pytorch/windows/internal/check_deps.bat b/.ci/pytorch/windows/internal/check_deps.bat index 46f438615774..0c4a65553f92 100644 --- a/.ci/pytorch/windows/internal/check_deps.bat +++ b/.ci/pytorch/windows/internal/check_deps.bat @@ -65,7 +65,11 @@ for /F "usebackq delims=" %%i in (`python -c "import sys; print('{0[0]}{0[1]}'.f if %PYVER% LSS 35 ( echo Warning: PyTorch for Python 2 under Windows is experimental. 
echo Python x64 3.5 or up is recommended to compile PyTorch on Windows +<<<<<<< HEAD echo Maybe you can create a virual environment if you have conda installed: +======= + echo Maybe you can create a virtual environment if you have conda installed: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo ^> conda create -n test python=3.6 pyyaml numpy echo ^> activate test ) diff --git a/.ci/pytorch/windows/internal/clone.bat b/.ci/pytorch/windows/internal/clone.bat index d76d13db1763..0ee9bebba223 100644 --- a/.ci/pytorch/windows/internal/clone.bat +++ b/.ci/pytorch/windows/internal/clone.bat @@ -8,7 +8,11 @@ goto submodule :clone_pytorch +<<<<<<< HEAD git clone https://github.com/%PYTORCH_REPO%/%MODULE_NAME% +======= +git clone https://github.com/%PYTORCH_REPO%/%MODULE_NAME% & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd %MODULE_NAME% diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat index b2d078944a5d..38fd82c25f4e 100644 --- a/.ci/pytorch/windows/internal/copy.bat +++ b/.ci/pytorch/windows/internal/copy.bat @@ -9,8 +9,12 @@ copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib +<<<<<<< HEAD copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib +======= +copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :: Should be set in build_pytorch.bat copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib diff --git a/.ci/pytorch/windows/internal/copy_cpu.bat b/.ci/pytorch/windows/internal/copy_cpu.bat index 864180d85dd1..1c324b600f25 100644 --- a/.ci/pytorch/windows/internal/copy_cpu.bat +++ b/.ci/pytorch/windows/internal/copy_cpu.bat @@ -1,3 +1,9 @@ +<<<<<<< HEAD copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib :: Should be set in build_pytorch.bat -copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib \ No newline at end of file +copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib +======= +copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib +:: Should be set in build_pytorch.bat +copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/cuda_install.bat b/.ci/pytorch/windows/internal/cuda_install.bat index 7e33b0805c9c..87bb978d5550 100644 --- a/.ci/pytorch/windows/internal/cuda_install.bat +++ b/.ci/pytorch/windows/internal/cuda_install.bat @@ -23,14 +23,21 @@ set CUDNN_LIB_FOLDER="lib\x64" :: Skip all of this if we already have cuda installed if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto set_cuda_env_vars +<<<<<<< HEAD if %CUDA_VER% EQU 118 goto cuda118 if %CUDA_VER% EQU 124 goto cuda124 if %CUDA_VER% EQU 126 goto cuda126 if %CUDA_VER% EQU 128 goto cuda128 +======= +if %CUDA_VER% EQU 126 goto cuda126 +if %CUDA_VER% EQU 128 goto cuda128 +if %CUDA_VER% EQU 129 goto cuda129 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo 
CUDA %CUDA_VERSION_STR% is not supported exit /b 1 +<<<<<<< HEAD :cuda118 set CUDA_INSTALL_EXE=cuda_11.8.0_522.06_windows.exe @@ -83,13 +90,19 @@ curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "% 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) goto cuda_common :cuda126 set CUDA_INSTALL_EXE=cuda_12.6.2_560.94_windows.exe if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( +<<<<<<< HEAD curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" +======= + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" set "ARGS=cuda_profiler_api_12.6 thrust_12.6 nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6" @@ -99,7 +112,11 @@ set CUDNN_FOLDER=cudnn-windows-x86_64-9.5.0.50_cuda12-archive set CUDNN_LIB_FOLDER="lib" set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( +<<<<<<< HEAD curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +======= + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ) @@ -116,7 +133,11 @@ goto cuda_common set CUDA_INSTALL_EXE=cuda_12.8.0_571.96_windows.exe if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( +<<<<<<< HEAD curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" +======= + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" set "ARGS=cuda_profiler_api_12.8 thrust_12.8 nvcc_12.8 cuobjdump_12.8 nvprune_12.8 nvprof_12.8 cupti_12.8 cublas_12.8 cublas_dev_12.8 cudart_12.8 cufft_12.8 cufft_dev_12.8 curand_12.8 curand_dev_12.8 cusolver_12.8 cusolver_dev_12.8 cusparse_12.8 cusparse_dev_12.8 npp_12.8 npp_dev_12.8 nvrtc_12.8 nvrtc_dev_12.8 nvml_dev_12.8 nvjitlink_12.8 nvtx_12.8" @@ -126,7 +147,38 @@ set CUDNN_FOLDER=cudnn-windows-x86_64-9.7.0.66_cuda12-archive set CUDNN_LIB_FOLDER="lib" set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( +<<<<<<< HEAD curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output 
"%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +======= + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" +) + +@REM cuDNN 8.3+ required zlib to be installed on the path +echo Installing ZLIB dlls +curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" +7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" +xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" + +goto cuda_common + +:cuda129 + +set CUDA_INSTALL_EXE=cuda_12.9.1_576.57_windows.exe +if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( + curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" + set "ARGS=cuda_profiler_api_12.9 thrust_12.9 nvcc_12.9 cuobjdump_12.9 nvprune_12.9 nvprof_12.9 cupti_12.9 cublas_12.9 cublas_dev_12.9 cudart_12.9 cufft_12.9 cufft_dev_12.9 curand_12.9 curand_dev_12.9 cusolver_12.9 cusolver_dev_12.9 cusparse_12.9 cusparse_dev_12.9 npp_12.9 npp_dev_12.9 nvrtc_12.9 nvrtc_dev_12.9 nvml_dev_12.9 nvjitlink_12.9 nvtx_12.9" +) + +set CUDNN_FOLDER=cudnn-windows-x86_64-9.10.2.21_cuda12-archive +set CUDNN_LIB_FOLDER="lib" +set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" +if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( + curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ) @@ -145,11 +197,14 @@ goto cuda_common :: If you cannot find the CUDA version you want to build for here then please :: add it @ https://github.com/pytorch/test-infra/tree/main/aws/ami/windows if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( +<<<<<<< HEAD if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "%SRC_DIR%\temp_build\NvToolsExt.7z" if errorlevel 1 exit /b 1 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" ( curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" if errorlevel 1 exit /b 1 @@ -176,6 +231,7 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations" ) +<<<<<<< HEAD echo Installing NvToolsExt... 
7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" @@ -185,6 +241,8 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_ xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo Installing cuDNN... 7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" @@ -215,4 +273,7 @@ echo Setting up environment... set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" +<<<<<<< HEAD set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.ci/pytorch/windows/internal/driver_update.bat b/.ci/pytorch/windows/internal/driver_update.bat index 551aa9c7a8a4..bd80f3657e75 100644 --- a/.ci/pytorch/windows/internal/driver_update.bat +++ b/.ci/pytorch/windows/internal/driver_update.bat @@ -1,5 +1,9 @@ set WIN_DRIVER_VN=528.89 +<<<<<<< HEAD set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" +======= +set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe if errorlevel 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/install_python.bat b/.ci/pytorch/windows/internal/install_python.bat new file mode 100644 index 000000000000..642acdb3981b --- /dev/null +++ b/.ci/pytorch/windows/internal/install_python.bat @@ -0,0 +1,20 @@ +set ADDITIONAL_OPTIONS="" +set PYTHON_EXEC="python" +if "%DESIRED_PYTHON%" == "3.13t" ( + echo Python version is set to 3.13t + set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" + set ADDITIONAL_OPTIONS="Include_freethreaded=1" + set PYTHON_EXEC="python3.13t" +) else ( + echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON% + set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =% +) + +del python-amd64.exe +curl --retry 3 -kL "%PYTHON_INSTALLER_URL%" --output python-amd64.exe +if errorlevel 1 exit /b 1 + +start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_test=0 %ADDITIONAL_OPTIONS% TargetDir=%CD%\Python +if errorlevel 1 exit /b 1 + +set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%" diff --git 
a/.ci/pytorch/windows/internal/setup.bat b/.ci/pytorch/windows/internal/setup.bat index f57bdcbec4bc..d8db85aa23d5 100644 --- a/.ci/pytorch/windows/internal/setup.bat +++ b/.ci/pytorch/windows/internal/setup.bat @@ -51,7 +51,11 @@ mkdir libtorch\test mkdir build pushd build +<<<<<<< HEAD python ../tools/build_libtorch.py +======= +%PYTHON_EXEC% ../tools/build_libtorch.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd IF ERRORLEVEL 1 exit /b 1 @@ -86,7 +90,11 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_ goto build_end :pytorch +<<<<<<< HEAD python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +======= +%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) :build_end IF ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/smoke_test.bat b/.ci/pytorch/windows/internal/smoke_test.bat index 3f9cc83cbb8d..d15997bdaaf5 100644 --- a/.ci/pytorch/windows/internal/smoke_test.bat +++ b/.ci/pytorch/windows/internal/smoke_test.bat @@ -35,6 +35,7 @@ exit /b 1 :wheel echo "install wheel package" +<<<<<<< HEAD set PYTHON_INSTALLER_URL= if "%DESIRED_PYTHON%" == "3.13t" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" if "%DESIRED_PYTHON%" == "3.13" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe" @@ -66,6 +67,10 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t if errorlevel 1 exit /b 1 set "PATH=%CD%\Python%PYTHON_VERSION%\Scripts;%CD%\Python;%PATH%" +======= +call "internal\install_python.bat" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install --pre numpy==2.2.1 protobuf if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install --pre numpy==2.1.2 protobuf if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf @@ -82,7 +87,11 @@ if "%PYTORCH_BUILD_VERSION:dev=%" NEQ "%PYTORCH_BUILD_VERSION%" ( ) set "EXTRA_INDEX= " +<<<<<<< HEAD if "%CUDA_VERSION%" == "xpu" set "EXTRA_INDEX=--index-url https://download.pytorch.org/whl/%CHANNEL%/xpu" +======= +if "%CUDA_VERSION%" == "xpu" set "EXTRA_INDEX=--index-url https://download.pytorch.org/whl/%CHANNEL%/xpu" %= @lint-ignore =% +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do %PYTHON_EXEC% -m pip install "%%i" %EXTRA_INDEX% if errorlevel 1 exit /b 1 @@ -128,7 +137,10 @@ goto end :libtorch echo "install and test libtorch" +<<<<<<< HEAD if "%VC_YEAR%" == "2019" powershell internal\vs2019_install.ps1 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%VC_YEAR%" == "2022" powershell internal\vs2022_install.ps1 if ERRORLEVEL 1 exit /b 1 @@ -140,10 +152,13 @@ pushd tmp\libtorch set VC_VERSION_LOWER=17 set VC_VERSION_UPPER=18 +<<<<<<< HEAD IF "%VC_YEAR%" == "2019" ( set VC_VERSION_LOWER=16 set VC_VERSION_UPPER=17 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for /f 
"usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( diff --git a/.ci/pytorch/windows/internal/static_lib_test.bat b/.ci/pytorch/windows/internal/static_lib_test.bat index ed8729408983..5e20ed914f07 100644 --- a/.ci/pytorch/windows/internal/static_lib_test.bat +++ b/.ci/pytorch/windows/internal/static_lib_test.bat @@ -37,7 +37,11 @@ if "%DEBUG%" == "1" ( if not "%CUDA_VERSION%" == "cpu" ( rmdir /s /q magma_%CUDA_PREFIX%_%BUILD_TYPE% del magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z +<<<<<<< HEAD curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z +======= + curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z & REM @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 7z x -aoa magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z -omagma_%CUDA_PREFIX%_%BUILD_TYPE% set LIB=%CD%\magma_%CUDA_PREFIX%_%BUILD_TYPE%\lib;%LIB% ) diff --git a/.ci/pytorch/windows/internal/vc_install_helper.bat b/.ci/pytorch/windows/internal/vc_install_helper.bat index 61ab6d5f8c98..bc425a224ff6 100644 --- a/.ci/pytorch/windows/internal/vc_install_helper.bat +++ b/.ci/pytorch/windows/internal/vc_install_helper.bat @@ -1,12 +1,18 @@ +<<<<<<< HEAD if "%VC_YEAR%" == "2019" powershell windows/internal/vs2019_install.ps1 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if "%VC_YEAR%" == "2022" powershell windows/internal/vs2022_install.ps1 set VC_VERSION_LOWER=17 set VC_VERSION_UPPER=18 +<<<<<<< HEAD if "%VC_YEAR%" == "2019" ( set VC_VERSION_LOWER=16 set VC_VERSION_UPPER=17 ) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -products Microsoft.VisualStudio.Product.BuildTools -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( diff --git a/.ci/pytorch/windows/internal/xpu_install.bat b/.ci/pytorch/windows/internal/xpu_install.bat index 94e7554cf13f..f341ac0df091 100644 --- a/.ci/pytorch/windows/internal/xpu_install.bat +++ b/.ci/pytorch/windows/internal/xpu_install.bat @@ -10,6 +10,7 @@ if not "%CUDA_VERSION%" == "xpu" ( set SRC_DIR=%NIGHTLIES_PYTORCH_ROOT% if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" +<<<<<<< HEAD set XPU_INSTALL_MODE=%~1 if "%XPU_INSTALL_MODE%"=="" goto xpu_bundle_install_start if "%XPU_INSTALL_MODE%"=="bundle" goto xpu_bundle_install_start @@ -57,6 +58,25 @@ if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.0] ( set XPU_EXTRA_VERSION=2025.0.1+1226 set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 +======= +:xpu_bundle_install_start + +set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI +set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe +set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product +set 
XPU_BUNDLE_VERSION=2025.0.1+20 +set XPU_BUNDLE_INSTALLED=0 +set XPU_BUNDLE_UNINSTALL=0 +set XPU_EXTRA_URL=NULL +set XPU_EXTRA_PRODUCT_NAME=intel.oneapi.win.compiler.product +set XPU_EXTRA_VERSION=2025.0.1+1226 +set XPU_EXTRA_INSTALLED=0 +set XPU_EXTRA_UNINSTALL=0 + +if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] ( + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe + set XPU_BUNDLE_VERSION=2025.1.3+5 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) :: Check if XPU bundle is target version or already installed diff --git a/.ci/pytorch/windows/setup_build.bat b/.ci/pytorch/windows/setup_build.bat new file mode 100644 index 000000000000..9b492eef664d --- /dev/null +++ b/.ci/pytorch/windows/setup_build.bat @@ -0,0 +1,27 @@ +IF "%DESIRED_PYTHON%"=="" ( + echo DESIRED_PYTHON is NOT defined. + exit /b 1 +) + +call "internal\install_python.bat" + +%PYTHON_EXEC% --version +set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%" +if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake +if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake +if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake +if "%DESIRED_PYTHON%" == "3.11" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake +if "%DESIRED_PYTHON%" == "3.10" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake +if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake + +%PYTHON_EXEC% -m pip install pyyaml +%PYTHON_EXEC% -m pip install mkl-include mkl-static +%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0 + +where cmake.exe + +:: Install libuv +curl -k https://s3.amazonaws.com/ossci-windows/libuv-1.40.0-h8ffe710_0.tar.bz2 -o libuv-1.40.0-h8ffe710_0.tar.bz2 +7z x -aoa libuv-1.40.0-h8ffe710_0.tar.bz2 +tar -xvf libuv-1.40.0-h8ffe710_0.tar -C %CD%\Python\ +set libuv_ROOT=%CD%\Python\Library diff --git a/.ci/pytorch/windows/xpu.bat b/.ci/pytorch/windows/xpu.bat index f9f5d9833839..975c7bd5bb3d 100644 --- a/.ci/pytorch/windows/xpu.bat +++ b/.ci/pytorch/windows/xpu.bat @@ -26,6 +26,10 @@ set VS2022INSTALLDIR=%VS15INSTALLDIR% set XPU_BUNDLE_ROOT=%ProgramFiles(x86)%\Intel\oneAPI call "%XPU_BUNDLE_ROOT%\compiler\latest\env\vars.bat" call "%XPU_BUNDLE_ROOT%\ocloc\latest\env\vars.bat" +<<<<<<< HEAD +======= +set USE_ONEMKL=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IF ERRORLEVEL 1 goto :eof if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\.. 
diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index b6b0d978cc23..4465c9184f25 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -206,7 +206,11 @@ if [[ "$USE_SPLIT_BUILD" == "true" ]]; then BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir" echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)" echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" +<<<<<<< HEAD BUILD_PYTHON_ONLY=1 BUILD_LIBTORCH_WHL=0 python setup.py bdist_wheel -d "$whl_tmp_dir" --cmake +======= + BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" else python setup.py bdist_wheel -d "$whl_tmp_dir" diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index b2db95ab62fa..f677aaca80c3 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -90,11 +90,28 @@ fi /pytorch/.ci/pytorch/check_binary.sh if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm* && "$PACKAGE_TYPE" != libtorch ]]; then +<<<<<<< HEAD # Exclude s390, xpu, rocm and libtorch builds from smoke testing python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled if [[ "\$GPU_ARCH_TYPE" != *cpu-aarch64* ]]; then # test for issue https://github.com/pytorch/pytorch/issues/149422 +======= + + torch_pkg_size="$(ls -1 /final_pkgs/torch-* | sort |tail -1 |xargs wc -c |cut -d ' ' -f1)" + # todo: implement check for large binaries + # if the package is larger than 1.5GB, we disable the pypi check. + # this package contains all libraries packaged in torch libs folder + # example of such package is https://download.pytorch.org/whl/cu126_full/torch + if [[ "\$torch_pkg_size" -gt 1500000000 ]]; then + python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled --pypi-pkg-check disabled + else + python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled $extra_parameters + fi + + if [[ "\$GPU_ARCH_TYPE" != *cpu-aarch64* ]]; then + # https://github.com/pytorch/pytorch/issues/149422 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python /pytorch/.ci/pytorch/smoke_test/check_gomp.py fi fi diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 67c69ba7e3ce..be2f3bc63353 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -79,8 +79,13 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" +<<<<<<< HEAD # CUDA 12.8 builds have triton for Linux and Linux aarch64 binaries. if [[ "$DESIRED_CUDA" == cu128 ]]; then +======= +# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries. 
+if [[ "$DESIRED_CUDA" == "cu129" ]]; then +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON_CONSTRAINT="platform_system == 'Linux'" fi @@ -109,6 +114,10 @@ fi # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then +<<<<<<< HEAD +======= + TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_xpu_version.txt) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}" if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt) diff --git a/.circleci/scripts/binary_upload.sh b/.circleci/scripts/binary_upload.sh index 28140b832028..b97bf85fb51d 100755 --- a/.circleci/scripts/binary_upload.sh +++ b/.circleci/scripts/binary_upload.sh @@ -55,12 +55,23 @@ s3_upload() { s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/" fi ( +<<<<<<< HEAD +======= + cache_control_flag="" + if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then + cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'" + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for pkg in ${PKG_DIR}/*.${extension}; do ( set -x shm_id=$(sha256sum "${pkg}" | awk '{print $1}') ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \ +<<<<<<< HEAD --metadata "checksum-sha256=${shm_id}" +======= + --metadata "checksum-sha256=${shm_id}" ${cache_control_flag} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) done ) diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 2d618ac53082..cc39e6cad175 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -4,16 +4,30 @@ set -eux -o pipefail source "${BINARY_ENV_FILE:-/c/w/env}" mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" +<<<<<<< HEAD export CUDA_VERSION="${DESIRED_CUDA/cu/}" export USE_SCCACHE=1 export SCCACHE_BUCKET=ossci-compiler-cache export SCCACHE_IGNORE_SERVER_IO_ERROR=1 export VC_YEAR=2019 +======= +if [[ "$OS" != "windows-arm64" ]]; then + export CUDA_VERSION="${DESIRED_CUDA/cu/}" + export USE_SCCACHE=1 + export SCCACHE_BUCKET=ossci-compiler-cache + export SCCACHE_IGNORE_SERVER_IO_ERROR=1 + export VC_YEAR=2022 +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 export USE_SCCACHE=0 +<<<<<<< HEAD export XPU_VERSION=2025.0 +======= + export XPU_VERSION=2025.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) export XPU_ENABLE_KINETO=1 fi @@ -22,7 +36,20 @@ df -h pushd "$PYTORCH_ROOT/.ci/pytorch/" export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT" +<<<<<<< HEAD ./windows/internal/build_wheels.bat +======= + +if [[ "$OS" == "windows-arm64" ]]; then + if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then + ./windows/arm64/build_libtorch.bat + elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then + ./windows/arm64/build_pytorch.bat + fi +else + ./windows/internal/build_wheels.bat +fi +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "Free space on filesystem after build:" df -h diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh index 5e44ef0427c1..23521a6d31a1 100644 --- a/.circleci/scripts/binary_windows_test.sh +++ b/.circleci/scripts/binary_windows_test.sh @@ -4,6 +4,7 @@ set -eux -o pipefail source "${BINARY_ENV_FILE:-/c/w/env}" export CUDA_VERSION="${DESIRED_CUDA/cu/}" +<<<<<<< HEAD export VC_YEAR=2019 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then @@ -13,5 +14,21 @@ fi pushd "$PYTORCH_ROOT/.ci/pytorch/" ./windows/internal/smoke_test.bat +======= +export VC_YEAR=2022 + +if [[ "$DESIRED_CUDA" == 'xpu' ]]; then + export VC_YEAR=2022 + export XPU_VERSION=2025.1 +fi + +pushd "$PYTORCH_ROOT/.ci/pytorch/" + +if [[ "$OS" == "windows-arm64" ]]; then + ./windows/arm64/smoke_test.bat +else + ./windows/internal/smoke_test.bat +fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd diff --git a/.clang-tidy b/.clang-tidy index a45142433ef7..187bcd046dc3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -48,12 +48,18 @@ misc-*, -misc-no-recursion, -misc-non-private-member-variables-in-classes, -misc-unused-using-decls, +<<<<<<< HEAD -misc-use-internal-linkage, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) modernize-*, -modernize-macro-to-enum, -modernize-return-braced-init-list, -modernize-use-auto, +<<<<<<< HEAD -modernize-use-default-member-init, +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -modernize-use-using, -modernize-use-trailing-return-type, -modernize-use-nodiscard, diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index e151576219af..343acae90cfe 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,3 +1,4 @@ +<<<<<<< HEAD FROM mcr.microsoft.com/vscode/devcontainers/miniconda:0-3 # I am suprised this is needed @@ -12,10 +13,36 @@ RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bi # Tools needed for llvm RUN sudo apt-get -y update RUN sudo apt install -y lsb-release wget software-properties-common gnupg +======= +FROM mcr.microsoft.com/vscode/devcontainers/base:ubuntu-22.04 + +# Tools needed for development +RUN apt-get -y update && \ + apt-get install -y \ + build-essential \ + cmake \ + ninja-build \ + git \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + libopenblas-dev + +# Tools needed for llvm +RUN apt-get install --no-install-recommends -y lsb-release wget software-properties-common gnupg && \ + sudo apt-get clean -y + +# Create Python virtual environment +# RUN python3 -m venv /opt/venv +# ENV PATH="/opt/venv/bin:$PATH" +RUN pip3 install --upgrade pip +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Install CLANG if version is specified ARG CLANG_VERSION RUN if [ -n "$CLANG_VERSION" ]; then \ +<<<<<<< HEAD sudo wget https://apt.llvm.org/llvm.sh; \ chmod +x llvm.sh; \ sudo ./llvm.sh "${CLANG_VERSION}"; \ @@ -32,3 +59,29 @@ ARG CUDA_VERSION RUN if [ -n "$CUDA_VERSION" ]; then \ conda install -y cuda -c "nvidia/label/cuda-${CUDA_VERSION}"; \ fi +======= + wget https://apt.llvm.org/llvm.sh; \ + chmod +x llvm.sh; \ + ./llvm.sh 
"${CLANG_VERSION}"; \ + echo 'export CC=clang' >> ~/.bashrc; \ + echo 'export CXX=clang++' >> ~/.bashrc; \ + apt-get install --no-install-recommends -y clang libomp-dev && \ + apt-get clean -y; \ + fi + + +# Install CUDA if version is specified +ARG CUDA_VERSION +RUN if [ -n "$CUDA_VERSION" ]; then \ + CUDA_REPO_VERSION=$(echo ${CUDA_VERSION} | sed 's/\./\-/g'); \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ + apt-get install --no-install-recommends -y cuda-toolkit-${CUDA_VERSION} && \ + apt-get clean -y; \ + fi + +# Set PATH for CUDA +ENV PATH="/usr/local/cuda/bin:${PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.devcontainer/cpu/devcontainer.json b/.devcontainer/cpu/devcontainer.json index aaca1e0e9066..c2483e731890 100644 --- a/.devcontainer/cpu/devcontainer.json +++ b/.devcontainer/cpu/devcontainer.json @@ -3,7 +3,11 @@ { "name": "PyTorch - CPU", "build": { +<<<<<<< HEAD "context": "../..", +======= + "context": "./", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "dockerfile": "../Dockerfile", "args": { "USERNAME": "vscode", @@ -11,6 +15,15 @@ "CLANG_VERSION": "" } }, +<<<<<<< HEAD +======= + // Mount the full repo only after the container starts + "workspaceMount": "source=${localWorkspaceFolder},target=/workspace/pytorch,type=bind,consistency=cached", + "workspaceFolder": "/workspace/pytorch", + "containerEnv": { + "PIP_USER": "0" // <‑‑ disable implicit --user + }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Features to add to the dev container. More info: https://containers.dev/features. "features": { diff --git a/.devcontainer/cuda/devcontainer.json b/.devcontainer/cuda/devcontainer.json index b0d448b8dc47..757e7c851de4 100644 --- a/.devcontainer/cuda/devcontainer.json +++ b/.devcontainer/cuda/devcontainer.json @@ -3,16 +3,34 @@ { "name": "PyTorch - CUDA", "build": { +<<<<<<< HEAD "context": "../..", +======= + "context": "./", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "dockerfile": "../Dockerfile", "args": { "USERNAME": "vscode", "BUILDKIT_INLINE_CACHE": "0", +<<<<<<< HEAD "CUDA_VERSION": "11.8.0", "CLANG_VERSION": "" } }, "runArgs": ["--gpus", "all"], +======= + "CUDA_VERSION": "12.8.0", + "CLANG_VERSION": "" + } + }, + "runArgs": ["--runtime", "nvidia", "--gpus", "all"], + // Mount the full repo only after the container starts + "workspaceMount": "source=${localWorkspaceFolder},target=/workspace/pytorch,type=bind,consistency=cached", + "workspaceFolder": "/workspace/pytorch", + "containerEnv": { + "PIP_USER": "0" // <‑‑ disable implicit --user + }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Use 'forwardPorts' to make a list of ports inside the container available locally. 
// "forwardPorts": [], diff --git a/.devcontainer/cuda/requirements.txt b/.devcontainer/cuda/requirements.txt new file mode 100644 index 000000000000..eb0a8b9c2b19 --- /dev/null +++ b/.devcontainer/cuda/requirements.txt @@ -0,0 +1,2 @@ +cmake +ninja \ No newline at end of file diff --git a/.devcontainer/scripts/install-dev-tools.sh b/.devcontainer/scripts/install-dev-tools.sh index f33f294645e7..b03b0e47ca57 100644 --- a/.devcontainer/scripts/install-dev-tools.sh +++ b/.devcontainer/scripts/install-dev-tools.sh @@ -8,6 +8,12 @@ git submodule update --init --recursive make setup-lint # Add CMAKE_PREFIX_PATH to bashrc +<<<<<<< HEAD echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc # Add linker path so that cuda-related libraries can be found echo 'export LDFLAGS="-L${CONDA_PREFIX}/lib/ $LDFLAGS"' >> ~/.bashrc +======= +echo 'export CMAKE_PREFIX_PATH=/usr/local' >> ~/.bashrc +# Add linker path so that cuda-related libraries can be found +echo 'export LDFLAGS="-L/usr/local/cuda/lib64/ $LDFLAGS"' >> ~/.bashrc +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000000..74c90164422d --- /dev/null +++ b/.editorconfig @@ -0,0 +1,14 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true + +# Python +[*.py] +indent_style = space +indent_size = 4 + +# Make +[Makefile] +indent_style = tab diff --git a/.flake8 b/.flake8 index c30f95886924..89f8318c6e52 100644 --- a/.flake8 +++ b/.flake8 @@ -19,6 +19,11 @@ ignore = G100,G101,G200 # these ignores are from flake8-simplify. please fix or ignore with commented reason SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, +<<<<<<< HEAD +======= + # SIM104 is already covered by pyupgrade ruff + SIM104, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # flake8-simplify code styles SIM102,SIM103,SIM106,SIM112, # TorchFix codes that don't make sense for PyTorch itself: diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 458f283507fc..92e02d2ac274 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -12,7 +12,13 @@ body: description: | Please provide a clear and concise description of what the bug is. +<<<<<<< HEAD If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: +======= + If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. + Your example should be fully self-contained and not rely on any artifact that should be downloaded. 
+ For example: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```python # All necessary imports at the beginning @@ -26,6 +32,10 @@ body: If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. +<<<<<<< HEAD +======= + If your issue is related to numerical accuracy or reproducibility, please read the [numerical accuracy](https://docs.pytorch.org/docs/stable/notes/numerical_accuracy.html) and [reproducibility](https://docs.pytorch.org/docs/stable/notes/randomness.html) notes. If the difference is not expected as described in these documents, please provide appropriate justification on why one result is wrong and the other is correct. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) placeholder: | A clear and concise description of what the bug is. diff --git a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md index b4b078badb34..8bdcc8a2c085 100644 --- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md +++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md @@ -5,7 +5,11 @@ title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" labels: "module: ci" --- +<<<<<<< HEAD > For example, DISABLED pull / win-vs2019-cpu-py3 / test (default). Once +======= +> For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) > created, the job will be disabled within 15 minutes. You can check the > list of disabled jobs at https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json diff --git a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml index be22b1446b4e..e6ea0b4b82a9 100644 --- a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml @@ -20,7 +20,11 @@ body: - Don't compare indices of max/min etc, because that avoids the above requirement +<<<<<<< HEAD - If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline +======= + - When comparing eager and torch.compile, use a higher precision result as a baseline. `torch._dynamo.utils.same` with fp64_ref will handle this comparison. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - Ensure rng state used to compare results is equivalent. 
Use `torch._inductor.config.fallback_random=True` and reset the torch rng seed between comparisons diff --git a/.github/ISSUE_TEMPLATE/release-feature-request.yml b/.github/ISSUE_TEMPLATE/release-feature-request.yml new file mode 100644 index 000000000000..80f10807ae56 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/release-feature-request.yml @@ -0,0 +1,111 @@ +name: 🚀 Release highlight for proposed Feature +description: Submit a Release highlight for proposed Feature +labels: ["release-feature-request"] + +body: +- type: textarea + attributes: + label: Release highlight for proposed Feature + description: > + Example: “A torch.special module, analogous to SciPy's special module.” +- type: input + id: contact + attributes: + label: Point(s) of contact + description: How can we get in touch with you if we need more info? + placeholder: ex. github username + validations: + required: false +- type: dropdown + attributes: + label: Release Mode (pytorch/pytorch features only) + description: | + If "out-of-tree", please include the GH repo name + options: + - In-tree + - Out-of-tree + validations: + required: true +- type: textarea + attributes: + label: Out-Of-Tree Repo + description: > + please include the GH repo name + validations: + required: false +- type: textarea + attributes: + label: Description and value to the user + description: > + Please provide a brief description of the feature and how it will benefit the user. + validations: + required: false +- type: textarea + attributes: + label: Link to design doc, GitHub issues, past submissions, etc + validations: + required: false +- type: textarea + attributes: + label: What feedback adopters have provided + description: > + Please list users/teams that have tried the feature and provided feedback. If that feedback motivated material changes (API, doc, etc..), a quick overview of the changes and the status (planned, in progress, implemented) would be helpful as well. + validations: + required: false +- type: dropdown + attributes: + label: Plan for documentations / tutorials + description: | + Select One of the following options + options: + - Tutorial exists + - Will submit a PR to pytorch/tutorials + - Will submit a PR to a repo + - Tutorial is not needed + validations: + required: true +- type: textarea + attributes: + label: Additional context for tutorials + description: > + Please provide a link for existing tutorial or link to a repo or context for why tutorial is not needed. + validations: + required: false +- type: dropdown + attributes: + label: Marketing/Blog Coverage + description: | + Are you requesting feature Inclusion in the release blogs? + options: + - "Yes" + - "No" + validations: + required: true +- type: textarea + attributes: + label: Are you requesting other marketing assistance with this feature? + description: > + E.g. supplementary blogs, social media amplification, etc. + validations: + required: false +- type: textarea + attributes: + label: Release Version + description: > + Please include release version for marketing coverage. + validations: + required: false +- type: textarea + attributes: + label: OS / Platform / Compute Coverage + description: > + Please list the platforms supported by the proposed feature. If the feature supports all the platforms, write "all". Goal of this section is to clearly share if this feature works in all PyTorch configurations or is it limited to only certain platforms/configurations (e.g. CPU only, GPU only, Linux only, etc...) 
+ validations: + required: false +- type: textarea + attributes: + label: Testing Support (CI, test cases, etc..) + description: > + Please provide an overview of test coverage. This includes unit testing and integration testing, but if E2E validation testing has been done to show that the feature works for a certain set of use cases or models please mention that as well. + validations: + required: false diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 35e1323ab8b8..370a9b7ddd11 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -14,6 +14,10 @@ self-hosted-runner: - linux.12xlarge - linux.24xlarge - linux.24xlarge.ephemeral +<<<<<<< HEAD +======= + - linux.24xlarge.amd +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - linux.arm64.2xlarge - linux.arm64.2xlarge.ephemeral - linux.arm64.m7g.4xlarge @@ -45,10 +49,24 @@ self-hosted-runner: - windows.g5.4xlarge.nvidia.gpu # Windows ARM64 runners - windows-11-arm64 +<<<<<<< HEAD # Organization-wide AMD hosted runners - linux.rocm.gpu - linux.rocm.gpu.2 - linux.rocm.gpu.4 +======= + - windows-11-arm64-preview + # Organization-wide AMD-hosted runners + # MI2xx runners + - linux.rocm.gpu + - linux.rocm.gpu.mi250 + - linux.rocm.gpu.2 + - linux.rocm.gpu.4 + # MI300 runners + - linux.rocm.gpu.mi300.2 + - linux.rocm.gpu.mi300.4 + - rocm-docker +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Repo-specific Apple hosted runners - macos-m1-ultra - macos-m2-14 diff --git a/.github/actions/binary-docker-build/action.yml b/.github/actions/binary-docker-build/action.yml new file mode 100644 index 000000000000..bc6e2b2196d1 --- /dev/null +++ b/.github/actions/binary-docker-build/action.yml @@ -0,0 +1,70 @@ +name: Binary docker build + +description: Build docker image for binary builds + +inputs: + docker-image-name: + description: Docker image name for PR builds + required: true + docker-build-dir: + description: Location of the build.sh relative to .ci/docker + required: true + custom-tag-prefix: + description: Custom tag prefix for the docker image + required: false + DOCKER_TOKEN: + description: Docker token for authentication + required: true + DOCKER_ID: + description: Docker ID for authentication + required: true + +runs: + using: composite + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ inputs.docker-image-name }} + docker-build-dir: .ci/docker + custom-tag-prefix: ${{ inputs.custom-tag-prefix }} + docker-build-script: ${{ inputs.docker-build-dir }}/build.sh + always-rebuild: true + push: true + + - name: Tag and (if WITH_PUSH) push docker image to docker.io + env: + DOCKER_TOKEN: ${{ inputs.DOCKER_TOKEN }} + DOCKER_ID: ${{ inputs.DOCKER_ID }} + DOCKER_IMAGE_NAME: ${{ inputs.docker-image-name }} + DOCKER_IMAGE_PREFIX: ${{ inputs.custom-tag-prefix }} + CREATED_FULL_DOCKER_IMAGE_NAME: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + set -euox pipefail + GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)} + GIT_BRANCH_NAME=${GITHUB_REF##*/} + GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)} + CI_FOLDER_SHA=$(git rev-parse HEAD:.ci/docker) + + 
DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX} + + docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX} + docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME} + docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA} + docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA} + + # Pretty sure Github will mask tokens and I'm not sure if it will even be + # printed due to pipe, but just in case + set +x + if [[ ${WITH_PUSH:-false} == "true" ]]; then + echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin + docker push ${DOCKER_IMAGE_NAME_PREFIX} + docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME} + docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA} + docker push ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA} + fi diff --git a/.github/actions/build-android/action.yml b/.github/actions/build-android/action.yml index 1d4d71fd9d36..f66f4988e29a 100644 --- a/.github/actions/build-android/action.yml +++ b/.github/actions/build-android/action.yml @@ -9,7 +9,11 @@ inputs: arch-for-build-env: description: | arch to pass to build environment. +<<<<<<< HEAD This is currently different than the arch name we use elswhere, which +======= + This is currently different than the arch name we use elsewhere, which +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) should be fixed. required: true github-secret: diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml index 7908e9a12c02..4e2fdb634173 100644 --- a/.github/actions/checkout-pytorch/action.yml +++ b/.github/actions/checkout-pytorch/action.yml @@ -23,12 +23,54 @@ runs: id: check_container_runner run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" +<<<<<<< HEAD - name: Clean workspace +======= + - name: Set up parallel fetch and clean workspace + id: first-clean + continue-on-error: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} env: NO_SUDO: ${{ inputs.no-sudo }} run: | +<<<<<<< HEAD +======= + # Use all available CPUs for fetching + cd "${GITHUB_WORKSPACE}" + git config --global fetch.parallel 0 + git config --global submodule.fetchJobs 0 + + # Clean workspace. 
The default checkout action should also do this, but + # do it here as well just in case + if [[ -d .git ]]; then + if [ -z "${NO_SUDO}" ]; then + sudo git clean -ffdx + else + git clean -ffdx + fi + fi + + - name: Checkout PyTorch + id: first-checkout-attempt + continue-on-error: true + uses: actions/checkout@v4 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # --depth=1 for speed, manually fetch history and other refs as necessary + fetch-depth: ${{ inputs.fetch-depth }} + submodules: ${{ inputs.submodules }} + show-progress: false + + - name: Clean workspace (try again) + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && + (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} + shell: bash + env: + NO_SUDO: ${{ inputs.no-sudo }} + run: | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } @@ -40,6 +82,7 @@ runs: fi mkdir "${GITHUB_WORKSPACE}" +<<<<<<< HEAD # Use all available CPUs for fetching cd "${GITHUB_WORKSPACE}" git config --global fetch.parallel 0 @@ -50,6 +93,13 @@ runs: with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # --depth=1 for speed, manually fetch history and other refs as necessary +======= + - name: Checkout PyTorch (try again) + uses: actions/checkout@v4 + if: ${{ steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success' }} + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fetch-depth: ${{ inputs.fetch-depth }} submodules: ${{ inputs.submodules }} show-progress: false diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml index 7da1ce3fe071..493bb0f15b56 100644 --- a/.github/actions/filter-test-configs/action.yml +++ b/.github/actions/filter-test-configs/action.yml @@ -157,4 +157,8 @@ runs: echo "Is keep-going label set? ${{ steps.filter.outputs.keep-going }}" echo +<<<<<<< HEAD echo "Renabled issues? ${{ steps.filter.outputs.reenabled-issues }}" +======= + echo "Reenabled issues? 
${{ steps.filter.outputs.reenabled-issues }}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml index 0b031046a7a5..7edcba38c776 100644 --- a/.github/actions/linux-test/action.yml +++ b/.github/actions/linux-test/action.yml @@ -66,7 +66,11 @@ runs: - name: configure aws credentials if : ${{ inputs.aws-role-to-assume != '' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-test @@ -153,7 +157,11 @@ runs: github-token: ${{ inputs.GITHUB_TOKEN }} - name: Check for keep-going label and re-enabled test issues +<<<<<<< HEAD # This uses the filter-test-configs action because it conviniently +======= + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going diff --git a/.github/actions/reuse-old-whl/action.yml b/.github/actions/reuse-old-whl/action.yml new file mode 100644 index 000000000000..1976a30828ed --- /dev/null +++ b/.github/actions/reuse-old-whl/action.yml @@ -0,0 +1,47 @@ +name: Reuse old wheel if possible + +description: + Reuse old wheel if possible + +inputs: + build-environment: + description: Build environment + required: true + run-id: + description: Workflow run ID + required: true + github-token: + description: GitHub token + required: true + job-id: + description: Job ID + required: true + job-name: + description: Job name + required: true + +outputs: + reuse: + description: Whether the wheel is reused or not + value: ${{ steps.check-file-changes.outputs.reuse }} + +runs: + using: composite + + steps: + # Check out pytorch with fetch depth 0 + - name: Check file changes + id: check-file-changes + shell: bash + continue-on-error: true + env: + GITHUB_TOKEN: ${{ inputs.github-token }} + JOB_ID: ${{ inputs.job-id }} + JOB_NAME: ${{ inputs.job-name }} + run: | + set -x + python3 -m pip install boto3==1.35.42 + python3 ${GITHUB_ACTION_PATH}/reuse_old_whl.py \ + --build-environment "${{ inputs.build-environment }}" \ + --run-id "${{ inputs.run-id }}" \ + --github-ref "${{ github.ref }}" diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py new file mode 100644 index 000000000000..c4756f4a2f4c --- /dev/null +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -0,0 +1,385 @@ +import argparse +import os +import subprocess +import sys +from functools import lru_cache +from pathlib import Path +from typing import Any, cast, Optional, Union + +import requests + + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +sys.path.insert(0, str(REPO_ROOT)) +from tools.stats.upload_metrics import emit_metric + + +sys.path.remove(str(REPO_ROOT)) # Clean up sys.path after import + + +FORCE_REBUILD_LABEL = "ci-force-rebuild" + + +@lru_cache +def get_merge_base() -> str: + merge_base = subprocess.check_output( + ["git", "merge-base", "HEAD", "origin/main"], + text=True, + stderr=subprocess.DEVNULL, + ).strip() + 
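    # The merge base against origin/main is the commit whose artifacts we try to
    # reuse: find_old_whl() looks up a matching main-branch build on S3, and
    # check_changed_files() verifies that only allow-listed files differ between
    # it and HEAD. For example (hypothetical paths), "torch/_dynamo/utils.py" or
    # "test/test_nn.py" would still allow reuse, while "torch/csrc/Module.cpp" or
    # "setup.py" would force a rebuild.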
# Remove this when we turn this off for the main branch + if merge_base == get_head_sha(): + print("Merge base is the same as HEAD, using HEAD^") + merge_base = subprocess.check_output( + ["git", "rev-parse", "HEAD^"], + text=True, + stderr=subprocess.DEVNULL, + ).strip() + print(f"Merge base: {merge_base}") + return merge_base + + +@lru_cache +def get_head_sha() -> str: + sha = subprocess.check_output( + ["git", "rev-parse", "HEAD"], + text=True, + stderr=subprocess.DEVNULL, + ).strip() + return sha + + +def is_main_branch() -> bool: + return False + # Testing on main branch for now + # print( + # f"Checking if we are on main branch: merge base {get_merge_base()}, head {get_head_sha()}" + # ) + # return get_merge_base() == get_head_sha() + + +def query_github_api(url: str) -> Any: + headers = { + "Accept": "application/vnd.github.v3+json", + "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}", + } + response = requests.get(url, headers=headers) + return response.json() + + +@lru_cache +def check_labels_for_pr() -> bool: + # Check if the current commit is part of a PR and if it has the + # FORCE_REBUILD_LABEL + head_sha = get_head_sha() + url = f"https://api.github.com/repos/pytorch/pytorch/commits/{head_sha}/pulls" + response = query_github_api(url) + + print( + f"Found {len(response)} PRs for commit {head_sha}: {[pr['number'] for pr in response]}" + ) + for pr in response: + labels = pr.get("labels", []) + for label in labels: + if label["name"] == FORCE_REBUILD_LABEL: + print(f"Found label {FORCE_REBUILD_LABEL} in PR {pr['number']}.") + return True + return False + + +def check_issue_open() -> bool: + # Check if issue #153759 is open. This is the config issue for quickly + # forcing everyone to build + url = "https://api.github.com/repos/pytorch/pytorch/issues/153759" + response = query_github_api(url) + if response.get("state") == "open": + print("Issue #153759 is open.") + return True + else: + print("Issue #153759 is not open.") + return False + + +def get_workflow_id(run_id: str) -> Optional[str]: + # Get the workflow ID that corresponds to the file for the run ID + url = f"https://api.github.com/repos/pytorch/pytorch/actions/runs/{run_id}" + response = query_github_api(url) + if "workflow_id" in response: + print(f"Found workflow ID for run ID {run_id}: {response['workflow_id']}") + return cast(str, response["workflow_id"]) + else: + print("No workflow ID found.") + return None + + +def ok_changed_file(file: str) -> bool: + # Return true if the file is in the list of allowed files to be changed to + # reuse the old whl + if ( + file.startswith("torch/") + and file.endswith(".py") + and not file.startswith("torch/csrc/") + ): + return True + if file.startswith("test/") and file.endswith(".py"): + return True + if file.startswith("docs/") and file.endswith((".md", ".rst")): + return True + return False + + +def check_changed_files(sha: str) -> bool: + # Return true if all the changed files are in the list of allowed files to + # be changed to reuse the old whl + + # Removing files in the torch folder is not allowed since rsync will not + # remove files + removed_files = ( + subprocess.check_output( + [ + "git", + "diff", + "--name-only", + sha, + "HEAD", + "--diff-filter=D", + "--no-renames", + ], + text=True, + stderr=subprocess.DEVNULL, + ) + .strip() + .split() + ) + if any(file.startswith("torch/") for file in removed_files): + print( + f"Removed files between {sha} and HEAD: {removed_files}, cannot reuse old whl" + ) + return False + + changed_files = ( + 
subprocess.check_output( + ["git", "diff", "--name-only", sha, "HEAD", "--no-renames"], + text=True, + stderr=subprocess.DEVNULL, + ) + .strip() + .split() + ) + print(f"Checking changed files between {sha} and HEAD:") + for file in changed_files: + if not ok_changed_file(file): + print(f" File {file} is not allowed to be changed.") + return False + else: + print(f" File {file} is allowed to be changed.") + return True + + +def find_old_whl(workflow_id: str, build_environment: str, sha: str) -> bool: + # Find the old whl on s3 and download it to artifacts.zip + if build_environment is None: + print("BUILD_ENVIRONMENT is not set.") + return False + print(f"SHA: {sha}, workflow_id: {workflow_id}") + + workflow_runs = query_github_api( + f"https://api.github.com/repos/pytorch/pytorch/actions/workflows/{workflow_id}/runs?head_sha={sha}&branch=main&per_page=100" + ) + if workflow_runs.get("total_count", 0) == 0: + print("No workflow runs found.") + return False + for run in workflow_runs.get("workflow_runs", []): + # Look in s3 for the old whl + run_id = run["id"] + try: + url = f"https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/{run_id}/{build_environment}/artifacts.zip" + print(f"Checking for old whl at {url}") + response = requests.get( + url, + ) + if response.status_code == 200: + with open("artifacts.zip", "wb") as f: + f.write(response.content) + print(f"Found old whl file from s3: {url}") + return True + except requests.RequestException as e: + print(f"Error checking for old whl: {e}") + continue + return False + + +def unzip_artifact_and_replace_files() -> None: + # Unzip the artifact and replace files + subprocess.check_output( + ["unzip", "-o", "artifacts.zip", "-d", "artifacts"], + ) + os.remove("artifacts.zip") + + head_sha = get_head_sha() + + # Rename wheel into zip + wheel_path = Path("artifacts/dist").glob("*.whl") + for path in wheel_path: + # Should be of the form torch-2.0.0+git1234567-cp37-etc.whl + # Should usually be the merge base sha but for the ones that didn't do + # the replacement, it won't be. 
Can probably change it to just be merge + # base later + old_version = f"+git{path.stem.split('+')[1].split('-')[0][3:]}" + new_version = f"+git{head_sha[:7]}" + + def rename_to_new_version(file: Union[str, Path]) -> None: + # Rename file with old_version to new_version + subprocess.check_output( + ["mv", file, str(file).replace(old_version, new_version)] + ) + + def change_content_to_new_version(file: Union[str, Path]) -> None: + # Check if is a file + if os.path.isdir(file): + return + # Replace the old version in the file with the new version + with open(file) as f: + content = f.read() + content = content.replace(old_version, new_version) + with open(file, "w") as f: + f.write(content) + + zip_path = path.with_suffix(".zip") + os.rename(path, zip_path) + old_stem = zip_path.stem + # Unzip the wheel + subprocess.check_output( + ["unzip", "-o", zip_path, "-d", f"artifacts/dist/{old_stem}"], + ) + + # Remove the old wheel (which is now a zip file) + os.remove(zip_path) + + # Copy python files into the artifact + subprocess.check_output( + ["rsync", "-avz", "torch", f"artifacts/dist/{old_stem}"], + ) + + change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") + + for file in Path(f"artifacts/dist/{old_stem}").glob( + "*.dist-info/**", + ): + change_content_to_new_version(file) + + rename_to_new_version(f"artifacts/dist/{old_stem}") + new_stem = old_stem.replace(old_version, new_version) + + for file in Path(f"artifacts/dist/{new_stem}").glob( + "*.dist-info", + ): + rename_to_new_version(file) + + # Zip the wheel back + subprocess.check_output( + ["zip", "-r", f"{new_stem}.zip", "."], + cwd=f"artifacts/dist/{new_stem}", + ) + + subprocess.check_output( + [ + "mv", + f"artifacts/dist/{new_stem}/{new_stem}.zip", + f"artifacts/dist/{new_stem}.whl", + ], + ) + + # Remove the extracted folder + subprocess.check_output( + ["rm", "-rf", f"artifacts/dist/{new_stem}"], + ) + + # Rezip the artifact + subprocess.check_output(["zip", "-r", "artifacts.zip", "."], cwd="artifacts") + subprocess.check_output( + ["mv", "artifacts/artifacts.zip", "."], + ) + return None + + +def set_output() -> None: + # Disable for now so we can monitor first + # pass + if os.getenv("GITHUB_OUTPUT"): + with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: + print("reuse=true", file=env) + else: + print("::set-output name=reuse::true") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Check for old whl files.") + parser.add_argument("--run-id", type=str, required=True, help="Workflow ID") + parser.add_argument( + "--build-environment", type=str, required=True, help="Build environment" + ) + parser.add_argument( + "--github-ref", + type=str, + ) + return parser.parse_args() + + +def can_reuse_whl(args: argparse.Namespace) -> tuple[bool, str]: + if args.github_ref and any( + args.github_ref.startswith(x) + for x in [ + "refs/heads/release", + "refs/tags/v", + "refs/heads/nightly", + ] + ): + print("Release branch, rebuild whl") + return (False, "Release branch") + + if not check_changed_files(get_merge_base()): + print("Cannot use old whl due to the changed files, rebuild whl") + return (False, "Changed files not allowed") + + if check_labels_for_pr(): + print(f"Found {FORCE_REBUILD_LABEL} label on PR, rebuild whl") + return (False, "Found FORCE_REBUILD_LABEL on PR") + + if check_issue_open(): + print("Issue #153759 is open, rebuild whl") + return (False, "Issue #153759 is open") + + workflow_id = get_workflow_id(args.run_id) + if workflow_id is None: + 
print("No workflow ID found, rebuild whl") + return (False, "No workflow ID found") + + if not find_old_whl(workflow_id, args.build_environment, get_merge_base()): + print("No old whl found, rebuild whl") + return (False, "No old whl found") + # TODO: go backwards from merge base to find more runs + + return (True, "Found old whl") + + +if __name__ == "__main__": + args = parse_args() + + reuse_whl, reason = can_reuse_whl(args) + + if reuse_whl: + print("Reusing old whl") + unzip_artifact_and_replace_files() + set_output() + + emit_metric( + "reuse_old_whl", + { + "reuse_whl": reuse_whl, + "reason": reason, + "build_environment": args.build_environment, + "merge_base": get_merge_base(), + "head_sha": get_head_sha(), + }, + ) diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index da514c04a69f..ee70cb47dc97 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -33,14 +33,22 @@ runs: id: check_container_runner run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" +<<<<<<< HEAD - name: Start docker if docker deamon is not running +======= + - name: Start docker if docker daemon is not running +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} run: | if systemctl is-active --quiet docker; then echo "Docker daemon is running..."; else +<<<<<<< HEAD echo "Starting docker deamon..." && sudo systemctl start docker; +======= + echo "Starting docker daemon..." && sudo systemctl start docker; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fi - name: Log in to ECR diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index 0982df529dd4..1d5772bc7e70 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -5,6 +5,15 @@ description: Set up ROCm host for CI runs: using: composite steps: +<<<<<<< HEAD +======= + - name: Runner ROCm version + if: always() + shell: bash + run: | + dpkg -l | grep -E " rocm" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Stop all running docker containers if: always() shell: bash diff --git a/.github/actions/setup-xpu/action.yml b/.github/actions/setup-xpu/action.yml index 50411e4bdf33..139f82b09fc1 100644 --- a/.github/actions/setup-xpu/action.yml +++ b/.github/actions/setup-xpu/action.yml @@ -29,13 +29,21 @@ runs: if: always() shell: bash run: | +<<<<<<< HEAD xpu-smi discovery +======= + timeout 30 xpu-smi discovery || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Runner health check GPU count if: always() shell: bash run: | +<<<<<<< HEAD ngpu=$(xpu-smi discovery | grep -c -E 'Device Name') +======= + ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) msg="Please file an issue on pytorch/pytorch reporting the faulty runner. 
Include a link to the runner logs so the runner can be identified" if [[ $ngpu -eq 0 ]]; then echo "Error: Failed to detect any GPUs on the runner" diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index 51fc8d14f474..03b9e69bf72d 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -15,7 +15,10 @@ runs: -e BINARY_ENV_FILE \ -e BUILD_ENVIRONMENT \ -e DESIRED_CUDA \ +<<<<<<< HEAD -e DESIRED_DEVTOOLSET \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e DESIRED_PYTHON \ -e GITHUB_ACTIONS \ -e GPU_ARCH_TYPE \ diff --git a/.github/actions/upload-sccache-stats/action.yml b/.github/actions/upload-sccache-stats/action.yml index 1561a72ee786..8e30d056d5db 100644 --- a/.github/actions/upload-sccache-stats/action.yml +++ b/.github/actions/upload-sccache-stats/action.yml @@ -22,6 +22,7 @@ runs: retention-days: 14 if-no-files-found: warn path: sccache-stats-*.json +<<<<<<< HEAD - name: Format sccache stats shell: bash @@ -37,3 +38,5 @@ runs: dry-run: false schema-version: v3 github-token: ${{ inputs.github-token }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml index 76b0e5533ce6..6cc23ff7f98c 100644 --- a/.github/actions/upload-test-artifacts/action.yml +++ b/.github/actions/upload-test-artifacts/action.yml @@ -48,6 +48,7 @@ runs: run: | # Remove any previous usage logs if they exist rm -f logs-*.zip +<<<<<<< HEAD # this workflow is also run in bazel build test, but we dont generate usage reports for it # so check to see if the file exists first if [ -f 'usage_log.txt' ]; then @@ -56,6 +57,10 @@ runs: if find "test/test-reports" -name "*.log" 2>/dev/null | grep -q .; then zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log' fi +======= + zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt' || true + zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log' || true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Zip debugging artifacts for upload if: runner.os != 'Windows' && !inputs.use-gha diff --git a/.github/actions/upload-utilization-stats/action.yml b/.github/actions/upload-utilization-stats/action.yml index 662a95330bb2..40fbb97b6a12 100644 --- a/.github/actions/upload-utilization-stats/action.yml +++ b/.github/actions/upload-utilization-stats/action.yml @@ -1,6 +1,10 @@ name: upload-utilization-stats +<<<<<<< HEAD description: Upload utilization stats to artifacts +======= +description: Upload utilization stats to artifacts. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inputs: workflow_run_id: @@ -23,6 +27,20 @@ inputs: type: string description: 'the job name of the test' required: True +<<<<<<< HEAD +======= + local_path: + type: string + description: 'the local path to the utilization stats file' + required: False + default: '' + artifact_prefix: + type: string + description: | + 'the prefix of the raw utilization data, for data stored in zip file, this is the prefix of the parent zip file' + default: "" + required: False +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runs: using: composite @@ -35,6 +53,11 @@ runs: echo "workflow_Name: ${{inputs.workflow_name}}" echo "job_id: ${{inputs.job_id}}" echo "job_name: ${{inputs.job_name}}" +<<<<<<< HEAD +======= + echo "artifact_prefix: ${{inputs.artifact_prefix}}" + python3 --version +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: nick-fields/retry@v3.0.0 name: Setup dependencies with: @@ -44,7 +67,11 @@ runs: retry_wait_seconds: 30 command: | set -eu +<<<<<<< HEAD python3 -m pip install python-dateutil==2.8.2 boto3==1.35.42 pandas==2.1.3 +======= + python3 -m pip install python-dateutil==2.8.2 boto3==1.35.42 pandas==2.1.3 dataclasses_json==0.6.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Upload utilizatoin stats to s3 shell: bash run: | @@ -53,4 +80,10 @@ runs: --workflow-name "${{inputs.workflow_name}}" \ --workflow-run-attempt "${{inputs.workflow_attempt}}" \ --job-id "${{inputs.job_id}}" \ +<<<<<<< HEAD --job-name "${{inputs.job_name}}" +======= + --job-name "${{inputs.job_name}}" \ + --local-path "${{inputs.local_path}}" \ + --artifact-prefix "${{inputs.artifact_prefix}}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index f0b99d5801e4..9b47f5a0cf38 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1,5 @@ +<<<<<<< HEAD c670ad81fda266b6598aeeef434583eb98197ae8 +======= +4e94321c54617dd738a05bfedfc28bc0fa635b5c +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt index 7e5c1c641e94..39e56b06da60 100644 --- a/.github/ci_commit_pins/torchbench.txt +++ b/.github/ci_commit_pins/torchbench.txt @@ -1 +1,5 @@ +<<<<<<< HEAD 373ffb19dc470f4423a3176a4133f8f4b3cdb5bd +======= +e03a63be43e33596f7f0a43b0f530353785e4a59 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index c642e5d08c80..a3ad9654d8be 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1,5 @@ +<<<<<<< HEAD d23a6e1664d20707c11781299611436e1f0c104f +======= +966da7e46f65d6d49df3e31214470a4fe5cc8e66 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 
110dab1a870d..52f75199b046 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1,5 @@ +<<<<<<< HEAD r2.7 +======= +r2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/label_to_label.yml b/.github/label_to_label.yml index 5d6544a2f50f..ddaaf1517f2e 100644 --- a/.github/label_to_label.yml +++ b/.github/label_to_label.yml @@ -42,7 +42,11 @@ - "module: aotinductor" - "module: cudagraphs" - "oncall: export" +<<<<<<< HEAD - "module: startup-tracing-compile" +======= + - "module: compile-time" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - "module: compiled autograd" - "module: flex attention" - "module: dynamic shapes" diff --git a/.github/labeler.yml b/.github/labeler.yml index 5bf481fd6f34..dc25850ea932 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -112,3 +112,24 @@ - torch/csrc/inductor/aoti_include/xpu.h - torch/csrc/inductor/cpp_wrapper/device_internal/xpu.h - torch/csrc/inductor/cpp_wrapper/xpu.h +<<<<<<< HEAD +======= + +"release notes: inductor (aoti)": +- torch/_C/_aoti.pyi +- torch/_dynamo/repro/aoti.py +- torch/_higher_order_ops/aoti_call_delegate.py +- torch/_inductor/codegen/aoti_runtime/** +- torch/_inductor/codegen/aoti_hipify_utils.py +- torch/_inductor/codegen/cpp_wrapper_cpu.py +- torch/_inductor/codegen/cpp_wrapper_gpu.py +- torch/_inductor/aoti_eager.py +- torch/csrc/inductor/aoti_runtime/** +- torch/csrc/inductor/aoti_torch/** +- torch/csrc/inductor/aoti_runner/** +- torch/csrc/inductor/aoti_eager/** +- torch/csrc/inductor/aoti_package/** +- torch/csrc/inductor/aoti_include/** +- torchgen/aoti/** +- torchgen/gen_aoti_c_shim.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index f4b0dc127aa7..74dc75ae1733 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -123,6 +123,11 @@ - torch/*docs.py approved_by: - svekars +<<<<<<< HEAD +======= + - sekyondaMeta + - AlannaBurke +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -393,19 +398,34 @@ - torch/_inductor/mkldnn_lowerings.py - torch/_inductor/fx_passes/mkldnn_fusion.py - torch/_inductor/fx_passes/quantization.py +<<<<<<< HEAD - torch/_inductor/codegen/cpp_prefix.h +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - torch/_inductor/codegen/cpp.py - torch/_inductor/codegen/cpp_utils.py - torch/_inductor/codegen/cpp_micro_gemm.py - torch/_inductor/codegen/cpp_template_kernel.py - torch/_inductor/codegen/cpp_template.py +<<<<<<< HEAD - torch/_inductor/codegen/cpp_gemm_template.py +======= + - torch/_inductor/codegen/cpp_bmm_template.py + - torch/_inductor/codegen/cpp_gemm_template.py + - torch/_inductor/codegen/cpp_grouped_gemm_template.py + - torch/_inductor/codegen/cpp_flex_attention_template.py + - torch/csrc/inductor/cpp_prefix.h +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - test/inductor/test_mkldnn_pattern_matcher.py - test/inductor/test_cpu_repro.py - test/inductor/test_cpu_cpp_wrapper.py - test/inductor/test_cpu_select_algorithm.py - 
aten/src/ATen/cpu/** - aten/src/ATen/native/quantized/cpu/** +<<<<<<< HEAD +======= + - aten/src/ATen/test/vec_test_all_types.* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - test/quantization/core/test_quantized_op.py - torch/ao/quantization/quantizer/x86_inductor_quantizer.py - test/quantization/pt2e/test_x86inductor_quantizer.py @@ -413,6 +433,10 @@ - leslie-fang-intel - jgong5 - EikanWang +<<<<<<< HEAD +======= + - CaoE +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint @@ -501,7 +525,13 @@ - name: XPU patterns: - '**xpu**' +<<<<<<< HEAD + - '**sycl**' +======= + - '**XPU**' - '**sycl**' + - '**SYCL**' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) approved_by: - EikanWang - jgong5 @@ -538,6 +568,10 @@ - bdhirsh - zou3519 - isuruf +<<<<<<< HEAD +======= + - Chillee +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mandatory_checks_name: - EasyCLA - Lint diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index ccb71e6a9bf0..e22b47feb618 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -11,11 +11,19 @@ ciflow_push_tags: - ciflow/inductor-perf-compare - ciflow/inductor-micro-benchmark - ciflow/inductor-micro-benchmark-cpu-x86 +<<<<<<< HEAD +======= +- ciflow/inductor-perf-test-nightly-x86-zen +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - ciflow/inductor-cu126 - ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly - ciflow/periodic +<<<<<<< HEAD +======= +- ciflow/periodic-rocm-mi300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - ciflow/rocm - ciflow/rocm-mi300 - ciflow/s390 @@ -24,7 +32,14 @@ ciflow_push_tags: - ciflow/unstable - ciflow/xpu - ciflow/torchbench +<<<<<<< HEAD - ciflow/autoformat +======= +- ciflow/op-benchmark +- ciflow/pull +- ciflow/h100 +- ciflow/h100-distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) retryable_workflows: - pull - trunk diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt index caabd1edf200..b0b3c046454b 100644 --- a/.github/requirements-gha-cache.txt +++ b/.github/requirements-gha-cache.txt @@ -10,5 +10,9 @@ lintrunner==0.10.7 ninja==1.10.0.post1 nvidia-ml-py==11.525.84 pyyaml==6.0 +<<<<<<< HEAD requests==2.32.2 +======= +requests==2.32.4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rich==10.9.0 diff --git a/.github/requirements/README.md b/.github/requirements/README.md index 102ac4d420f0..b3c9bde0b143 100644 --- a/.github/requirements/README.md +++ b/.github/requirements/README.md @@ -11,6 +11,7 @@ jobs, but it also allows them to be cached properly to improve CI reliability. The list of support files are as follows: +<<<<<<< HEAD * Conda: * conda-env-iOS. This is used by iOS build and test jobs to setup the @@ -22,5 +23,8 @@ The list of support files are as follows: * Pip: * pip-requirements-iOS.txt. 
This is used by iOS build and test jobs to setup the pip environment +======= +* Pip: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * pip-requirements-macOS.txt. This is used by MacOS build and test jobs to setup the pip environment diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64 index 24ba665883ff..df9701270655 100644 --- a/.github/requirements/conda-env-macOS-ARM64 +++ b/.github/requirements/conda-env-macOS-ARM64 @@ -1,3 +1,4 @@ +<<<<<<< HEAD numpy=1.22.3 pyyaml=6.0 setuptools=72.1.0 @@ -20,3 +21,10 @@ certifi # Cross-compiling arm64 from x86-64 picks up 1.40.0 while testing on arm64 # itself only has up to 1.39.0 from upstream conda. Both work though libuv>=1.39.0,<=1.40.0 +======= +# Not pinning certifi so that we can always get the latest certificates +certifi +pip=23.2.1 +pkg-config=0.29.2 +wheel=0.37.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 06e0428c883b..13d4ae1e8f3f 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -1,4 +1,5 @@ boto3==1.35.42 +<<<<<<< HEAD hypothesis==6.56.4 expecttest==0.3.0 fbscribelogger==0.1.7 @@ -31,3 +32,40 @@ optree==0.13.0 # which the stringify metadata is wrong when escaping double quote protobuf==3.20.2 parameterized==0.8.1 +======= +cmake==3.27.* +expecttest==0.3.0 +fbscribelogger==0.1.7 +filelock==3.6.0 +hypothesis==6.56.4 +librosa>=0.6.2 +mpmath==1.3.0 +networkx==2.8.7 +ninja==1.10.2.4 +numba==0.59.0 +numpy==1.26.4 +opt-einsum>=3.3 +optree==0.13.0 +packaging==23.1 +parameterized==0.8.1 +pillow==10.3.0 +protobuf==5.29.4 +psutil==5.9.1 +pygments==2.15.0 +pytest-cpp==2.3.0 +pytest-flakefinder==1.1.0 +pytest-rerunfailures==10.3 +pytest-subtests==0.13.1 +pytest-xdist==3.3.1 +pytest==7.3.2 +pyyaml==6.0.2 +scipy==1.12.0 +setuptools==72.1.0 +sympy==1.13.3 +tlparse==0.3.30 +tensorboard==2.13.0 +typing-extensions==4.12.2 +unittest-xml-reporting<=3.2.0,>=2.0.0 +xdoctest==1.1.0 +z3-solver==4.12.2.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/scripts/amd/patch_triton_wheel.sh b/.github/scripts/amd/patch_triton_wheel.sh index ac233bdc4318..cbf159d08d42 100755 --- a/.github/scripts/amd/patch_triton_wheel.sh +++ b/.github/scripts/amd/patch_triton_wheel.sh @@ -76,7 +76,11 @@ for pkg in /$WHEELHOUSE_DIR/*triton*.whl; do echo "Copied $filepath to $patchedpath" done +<<<<<<< HEAD # Go through all required shared objects and see if any of our other objects are dependants. If so, replace so.ver wth so +======= + # Go through all required shared objects and see if any of our other objects are dependants. 
If so, replace so.ver with so +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for ((i=0;i<${#deps[@]};++i)); do echo "replacing "${deps_soname[i]} ${patched[i]} replace_needed_sofiles $PREFIX/$ROCM_LIB ${deps_soname[i]} ${patched[i]} diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 9bf48ff011a2..303fdc451d83 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -22,8 +22,16 @@ def read_triton_pin(device: str = "cuda") -> str: return f.read().strip() +<<<<<<< HEAD def read_triton_version() -> str: with open(REPO_DIR / ".ci" / "docker" / "triton_version.txt") as f: +======= +def read_triton_version(device: str = "cuda") -> str: + triton_version_file = "triton_version.txt" + if device == "xpu": + triton_version_file = "triton_xpu_version.txt" + with open(REPO_DIR / ".ci" / "docker" / triton_version_file) as f: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return f.read().strip() @@ -95,6 +103,10 @@ def build_triton( with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) triton_repo = "https://github.com/openai/triton" if device == "rocm": triton_repo = "https://github.com/ROCm/triton" @@ -126,7 +138,11 @@ def build_triton( patch_init_py( triton_pythondir / "triton" / "__init__.py", version=f"{version}", +<<<<<<< HEAD expected_version=None, +======= + expected_version=read_triton_version(device), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) if device == "rocm": @@ -137,11 +153,27 @@ def build_triton( ) print("ROCm libraries setup for triton installation...") +<<<<<<< HEAD check_call( [sys.executable, "setup.py", "bdist_wheel"], cwd=triton_pythondir, env=env ) whl_path = next(iter((triton_pythondir / "dist").glob("*.whl"))) +======= + # old triton versions have setup.py in the python/ dir, + # new versions have it in the root dir. 
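    # i.e. prefer the checkout root when it contains setup.py and fall back to
    # python/ otherwise, so the same bdist_wheel invocation works for both layouts.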
+ triton_setupdir = ( + triton_basedir + if (triton_basedir / "setup.py").exists() + else triton_pythondir + ) + + check_call( + [sys.executable, "setup.py", "bdist_wheel"], cwd=triton_setupdir, env=env + ) + + whl_path = next(iter((triton_setupdir / "dist").glob("*.whl"))) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shutil.copy(whl_path, Path.cwd()) if device == "rocm": @@ -164,15 +196,29 @@ def main() -> None: parser.add_argument("--py-version", type=str) parser.add_argument("--commit-hash", type=str) parser.add_argument("--with-clang-ldd", action="store_true") +<<<<<<< HEAD parser.add_argument("--triton-version", type=str, default=read_triton_version()) args = parser.parse_args() +======= + parser.add_argument("--triton-version", type=str, default=None) + args = parser.parse_args() + + triton_version = read_triton_version(args.device) + if args.triton_version: + triton_version = args.triton_version + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_triton( device=args.device, commit_hash=( args.commit_hash if args.commit_hash else read_triton_pin(args.device) ), +<<<<<<< HEAD version=args.triton_version, +======= + version=triton_version, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) py_version=args.py_version, release=args.release, with_clang_ldd=args.with_clang_ldd, diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index a10c3c3f886c..15640a856611 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -28,12 +28,20 @@ def main() -> None: issue = repo.get_issue(issue_number) issue_labels = issue.labels docathon_label_present = any( +<<<<<<< HEAD label.name == "docathon-h1-2024" for label in issue_labels +======= + label.name == "docathon-h1-2025" for label in issue_labels +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) # if the issue has a docathon label, add all labels from the issue to the PR. 
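    # (when the docathon label is absent, the script logs that and returns without
    # touching the PR's labels)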
if not docathon_label_present: +<<<<<<< HEAD print("The 'docathon-h1-2024' label is not present in the issue.") +======= + print("The 'docathon-h1-2025' label is not present in the issue.") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return pull_request_labels = pull_request.get_labels() pull_request_label_names = [label.name for label in pull_request_labels] diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index a65e427e8c22..46786f79117f 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -1,4 +1,8 @@ #!/usr/bin/env python3 +<<<<<<< HEAD +======= +# ruff: noqa: LOG015 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) import json import logging @@ -39,9 +43,15 @@ def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: } # The link to the published list of disabled jobs +<<<<<<< HEAD DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=n.FT07XR3dLMwOLBwmRNquyYSeGk8Het" # and unstable jobs UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=.Ox7WAXa21I1PVqadHyPfhMRPhl0aCnD" +======= +DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=HnkH0xQWnnsoeMsSIVf9291NE5c4jWSa" +# and unstable jobs +UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=iP_F8gBs60PfOMAJ8gnn1paVrzM1WYsK" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Some constants used to handle disabled and unstable jobs JOB_NAME_SEP = "/" @@ -79,7 +89,11 @@ def parse_args() -> Any: parser.add_argument( "--job-name", type=str, +<<<<<<< HEAD help="the name of the current job, i.e. linux-focal-py3.8-gcc7 / build", +======= + help="the name of the current job, i.e. 
linux-jammy-py3.8-gcc7 / build", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) parser.add_argument("--pr-number", type=str, help="the pull request number") parser.add_argument("--tag", type=str, help="the associated tag if it exists") diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 67f86d878a88..608fd8666c49 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -15,6 +15,7 @@ from typing import Optional +<<<<<<< HEAD # NOTE: Also update the CUDA sources in tools/nightly.py when changing this list CUDA_ARCHES = ["11.8", "12.6", "12.8"] CUDA_STABLE = "12.6" @@ -36,10 +37,32 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"] +======= +# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this +CUDA_ARCHES = ["12.6", "12.8", "12.9"] +CUDA_STABLE = "12.8" +CUDA_ARCHES_FULL_VERSION = { + "12.6": "12.6.3", + "12.8": "12.8.1", + "12.9": "12.9.1", +} +CUDA_ARCHES_CUDNN_VERSION = { + "12.6": "9", + "12.8": "9", + "12.9": "9", +} + +# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this +ROCM_ARCHES = ["6.3", "6.4"] + +XPU_ARCHES = ["xpu"] + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CPU_AARCH64_ARCH = ["cpu-aarch64"] CPU_S390X_ARCH = ["cpu-s390x"] +<<<<<<< HEAD CUDA_AARCH64_ARCHES = ["12.8-aarch64"] @@ -57,23 +80,39 @@ "nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'" ), +======= +CUDA_AARCH64_ARCHES = ["12.9-aarch64"] + + +PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "12.6": ( "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " +<<<<<<< HEAD "nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | " +======= + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " +<<<<<<< HEAD "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " +======= + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" ), "12.8": ( +<<<<<<< HEAD "nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | " @@ -101,6 +140,60 @@ "tcmlib==1.2.0 | " "umf==0.9.1 | " "intel-pti==0.10.1" +======= + "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), + "12.9": ( + "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 
'x86_64'" + ), + "xpu": ( + "intel-cmplr-lib-rt==2025.1.1 | " + "intel-cmplr-lib-ur==2025.1.1 | " + "intel-cmplr-lic-rt==2025.1.1 | " + "intel-sycl-rt==2025.1.1 | " + "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "onemkl-sycl-blas==2025.1.0 | " + "onemkl-sycl-dft==2025.1.0 | " + "onemkl-sycl-lapack==2025.1.0 | " + "onemkl-sycl-rng==2025.1.0 | " + "onemkl-sycl-sparse==2025.1.0 | " + "dpcpp-cpp-rt==2025.1.1 | " + "intel-opencl-rt==2025.1.1 | " + "mkl==2025.1.0 | " + "intel-openmp==2025.1.1 | " + "tbb==2022.1.0 | " + "tcmlib==1.3.0 | " + "umf==0.10.0 | " + "intel-pti==0.12.3" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ), } @@ -146,8 +239,11 @@ def arch_type(arch_version: str) -> str: return "rocm" elif arch_version in XPU_ARCHES: return "xpu" +<<<<<<< HEAD elif arch_version in CPU_CXX11_ABI_ARCH: return "cpu-cxx11-abi" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif arch_version in CPU_AARCH64_ARCH: return "cpu-aarch64" elif arch_version in CPU_S390X_ARCH: @@ -158,6 +254,7 @@ def arch_type(arch_version: str) -> str: return "cpu" +<<<<<<< HEAD # This can be updated to the release version when cutting release branch, i.e. 2.1 DEFAULT_TAG = os.getenv("RELEASE_VERSION_TAG", "main") @@ -201,6 +298,30 @@ def arch_type(arch_version: str) -> str: for gpu_arch in ROCM_ARCHES }, ("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}", +======= +DEFAULT_TAG = os.getenv("RELEASE_VERSION_TAG", "main") + +WHEEL_CONTAINER_IMAGES = { + **{gpu_arch: f"manylinux2_28-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES}, + **{ + gpu_arch: f"manylinuxaarch64-builder:cuda{gpu_arch.replace('-aarch64', '')}" + for gpu_arch in CUDA_AARCH64_ARCHES + }, + **{gpu_arch: f"manylinux2_28-builder:rocm{gpu_arch}" for gpu_arch in ROCM_ARCHES}, + "xpu": "manylinux2_28-builder:xpu", + "cpu": "manylinux2_28-builder:cpu", + "cpu-aarch64": "manylinux2_28_aarch64-builder:cpu-aarch64", + "cpu-s390x": "pytorch/manylinuxs390x-builder:cpu-s390x", +} + +RELEASE = "release" +DEBUG = "debug" + +LIBTORCH_CONTAINER_IMAGES: dict[str, str] = { + **{gpu_arch: f"libtorch-cxx11-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES}, + **{gpu_arch: f"libtorch-cxx11-builder:rocm{gpu_arch}" for gpu_arch in ROCM_ARCHES}, + "cpu": "libtorch-cxx11-builder:cpu", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"] @@ -210,7 +331,10 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: return { "cpu": "cpu", "cpu-aarch64": "cpu", +<<<<<<< HEAD "cpu-cxx11-abi": "cpu-cxx11-abi", +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "cpu-s390x": "cpu", "cuda": f"cu{gpu_arch_version.replace('.', '')}", "cuda-aarch64": f"cu{gpu_arch_version.replace('-aarch64', '').replace('.', '')}", @@ -225,7 +349,11 @@ def list_without(in_list: list[str], without: list[str]) -> list[str]: def generate_libtorch_matrix( os: str, +<<<<<<< HEAD abi_version: str, +======= + release_type: str, +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) arches: Optional[list[str]] = None, libtorch_variants: Optional[list[str]] = None, ) -> list[dict[str, str]]: @@ -247,9 +375,12 @@ def generate_libtorch_matrix( ret: list[dict[str, str]] = [] for arch_version in arches: for libtorch_variant in libtorch_variants: +<<<<<<< HEAD # one of the values in the following list must be exactly # CXX11_ABI, but the precise value of the other one doesn't # matter +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) gpu_arch_type = arch_type(arch_version) gpu_arch_version = "" if arch_version == "cpu" else arch_version # ROCm builds without-deps failed even in ROCm runners; skip for now @@ -262,6 +393,7 @@ def generate_libtorch_matrix( "desired_cuda": translate_desired_cuda( gpu_arch_type, gpu_arch_version ), +<<<<<<< HEAD "libtorch_variant": libtorch_variant, "libtorch_config": abi_version if os in ("windows", "windows-arm64") @@ -271,11 +403,26 @@ def generate_libtorch_matrix( else "", "container_image": ( LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)] +======= + "libtorch_config": release_type, + "libtorch_variant": libtorch_variant, + "container_image": ( + LIBTORCH_CONTAINER_IMAGES[arch_version].split(":")[0] + if os not in ("windows", "windows-arm64") + else "" + ), + "container_image_tag_prefix": ( + LIBTORCH_CONTAINER_IMAGES[arch_version].split(":")[1] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if os not in ("windows", "windows-arm64") else "" ), "package_type": "libtorch", +<<<<<<< HEAD "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace( +======= + "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{release_type}".replace( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ".", "_" ), } @@ -301,7 +448,11 @@ def generate_wheels_matrix( # Define default compute archivectures arches = ["cpu"] if os == "linux": +<<<<<<< HEAD arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES +======= + arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) elif os == "windows": arches += CUDA_ARCHES + XPU_ARCHES elif os == "linux-aarch64": @@ -320,7 +471,10 @@ def generate_wheels_matrix( gpu_arch_version = ( "" if arch_version == "cpu" +<<<<<<< HEAD or arch_version == "cpu-cxx11-abi" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) or arch_version == "cpu-aarch64" or arch_version == "cpu-s390x" or arch_version == "xpu" @@ -332,10 +486,17 @@ def generate_wheels_matrix( continue if use_split_build and ( +<<<<<<< HEAD arch_version not in ["12.6", "12.8", "11.8", "cpu"] or os != "linux" ): raise RuntimeError( "Split build is only supported on linux with cuda 12*, 11.8, and cpu.\n" +======= + arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux" + ): + raise RuntimeError( + "Split build is only supported on linux with cuda 12* and cpu.\n" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) f"Currently attempting to build on arch version 
{arch_version} and os {os}.\n" "Please modify the matrix generation to exclude this combination." ) @@ -343,7 +504,11 @@ def generate_wheels_matrix( # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( +<<<<<<< HEAD arch_version in ["12.8", "12.6", "11.8"] +======= + arch_version in ["12.9", "12.8", "12.6"] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -355,8 +520,17 @@ def generate_wheels_matrix( "gpu_arch_version": gpu_arch_version, "desired_cuda": desired_cuda, "use_split_build": "True" if use_split_build else "False", +<<<<<<< HEAD "devtoolset": "cxx11-abi", "container_image": WHEEL_CONTAINER_IMAGES[arch_version], +======= + "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( + ":" + )[0], + "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[1], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "package_type": package_type, "pytorch_extra_install_requirements": ( PYTORCH_EXTRA_INSTALL_REQUIREMENTS[ @@ -384,8 +558,17 @@ def generate_wheels_matrix( gpu_arch_type, gpu_arch_version ), "use_split_build": "True" if use_split_build else "False", +<<<<<<< HEAD "devtoolset": "", "container_image": WHEEL_CONTAINER_IMAGES[arch_version], +======= + "container_image": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[0], + "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[1], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "package_type": package_type, "pytorch_extra_install_requirements": "", "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950 @@ -403,6 +586,7 @@ def generate_wheels_matrix( gpu_arch_type, gpu_arch_version ), "use_split_build": "True" if use_split_build else "False", +<<<<<<< HEAD "devtoolset": ( "cxx11-abi" if (arch_version in ["cpu-cxx11-abi", "cpu-aarch64"]) @@ -410,6 +594,14 @@ def generate_wheels_matrix( else "" ), "container_image": WHEEL_CONTAINER_IMAGES[arch_version], +======= + "container_image": WHEEL_CONTAINER_IMAGES[arch_version].split( + ":" + )[0], + "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[ + arch_version + ].split(":")[1], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "package_type": package_type, "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( ".", "_" @@ -425,6 +617,12 @@ def generate_wheels_matrix( return ret +<<<<<<< HEAD validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") validate_nccl_dep_consistency("11.8") +======= +validate_nccl_dep_consistency("12.9") +validate_nccl_dep_consistency("12.8") +validate_nccl_dep_consistency("12.6") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 520845413e20..da9fe66bbf1e 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -54,7 +54,10 @@ class BinaryBuildWorkflow: # Optional fields build_environment: str = "" +<<<<<<< HEAD abi_version: str = "" +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) is_scheduled: str = "" branches: str = "nightly" @@ -62,6 +65,7 @@ class BinaryBuildWorkflow: cross_compile_arm64: bool = False macos_runner: str = "macos-14-xlarge" use_split_build: bool = False +<<<<<<< HEAD def __post_init__(self) -> None: if self.abi_version: @@ -70,6 +74,18 @@ def __post_init__(self) -> None: ) else: self.build_environment = f"{self.os}-binary-{self.package_type}" +======= + # Mainly used for libtorch builds + build_variant: str = "" + + def __post_init__(self) -> None: + if self.build_environment == "": + self.build_environment = "-".join( + item + for item in [self.os, "binary", self.package_type, self.build_variant] + if item != "" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if self.use_split_build: # added to distinguish concurrency groups self.build_environment += "-split" @@ -133,10 +149,16 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.CXX11_ABI, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI, +======= + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, + generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch_variants=["shared-with-deps"], ), ciflow_config=CIFlowConfig( @@ -152,7 +174,11 @@ class OperatingSystem: package_type="manywheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.LINUX, +<<<<<<< HEAD arches=["11.8", "12.6", "12.8"], +======= + arches=["12.6", "12.8", "12.9", "6.4"], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) python_versions=["3.9"], ), branches="main", @@ -176,10 +202,17 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.CXX11_ABI, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI, +======= + build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, + generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) arches=["cpu"], libtorch_variants=["shared-with-deps"], ), @@ -202,7 +235,11 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.RELEASE, +======= + build_variant=generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE, @@ -216,7 +253,11 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.DEBUG, +======= + 
build_variant=generate_binary_build_matrix.DEBUG, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG, @@ -227,13 +268,63 @@ class OperatingSystem: isolated_workflow=True, ), ), +<<<<<<< HEAD +======= + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS_ARM64, + package_type="wheel", + build_configs=generate_binary_build_matrix.generate_wheels_matrix( + OperatingSystem.WINDOWS_ARM64, + arches=["cpu"], + python_versions=["3.11", "3.12", "3.13"], + ), + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + isolated_workflow=True, + ), + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS_ARM64, + package_type="libtorch", + build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS_ARM64, + generate_binary_build_matrix.RELEASE, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + isolated_workflow=True, + ), + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS_ARM64, + package_type="libtorch", + build_variant=generate_binary_build_matrix.DEBUG, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS_ARM64, + generate_binary_build_matrix.DEBUG, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + isolated_workflow=True, + ), + ), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] WINDOWS_BINARY_SMOKE_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.RELEASE, +======= + build_variant=generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE, @@ -248,7 +339,11 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.DEBUG, +======= + build_variant=generate_binary_build_matrix.DEBUG, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG, @@ -262,6 +357,7 @@ class OperatingSystem: ), ] +<<<<<<< HEAD WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS_ARM64, @@ -308,14 +404,23 @@ class OperatingSystem: ), ] +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) MACOS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, package_type="libtorch", +<<<<<<< HEAD abi_version=generate_binary_build_matrix.CXX11_ABI, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( OperatingSystem.MACOS, generate_binary_build_matrix.CXX11_ABI, +======= + 
build_variant=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.MACOS, + generate_binary_build_matrix.RELEASE, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch_variants=["shared-with-deps"], ), cross_compile_arm64=False, @@ -403,10 +508,13 @@ def main() -> None: WINDOWS_BINARY_SMOKE_WORKFLOWS, ), ( +<<<<<<< HEAD jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"), WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS, ), ( +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jinja_env.get_template("macos_binary_build_workflow.yml.j2"), MACOS_BINARY_BUILD_WORKFLOWS, ), diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index cfbfe315bf69..7c8cac54bff9 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -64,7 +64,11 @@ def fetch_url( ) exception_message = ( "Is github alright?", +<<<<<<< HEAD f"Recieved status code '{err.code}' when attempting to retrieve {url}:\n", +======= + f"Received status code '{err.code}' when attempting to retrieve {url}:\n", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) f"{err.reason}\n\nheaders={err.headers}", ) raise RuntimeError(exception_message) from err diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 3a42298cdf37..cef5aa20f950 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -128,7 +128,11 @@ def gh_fetch_json_dict( def gh_graphql(query: str, **kwargs: Any) -> dict[str, Any]: rc = gh_fetch_url( +<<<<<<< HEAD "https://api.github.com/graphql", +======= + "https://api.github.com/graphql", # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) data={"query": query, "variables": kwargs}, reader=json.load, ) diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 43ee063bd634..4b572f2b681c 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -211,7 +211,11 @@ def compute_branch_diffs( self, from_branch: str, to_branch: str ) -> tuple[list[str], list[str]]: """ +<<<<<<< HEAD Returns list of commmits that are missing in each other branch since their merge base +======= + Returns list of commits that are missing in each other branch since their merge base +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Might be slow if merge base is between two branches is pretty far off """ from_ref = self.rev_parse(from_branch) diff --git a/.github/scripts/label_utils.py b/.github/scripts/label_utils.py index 00c7cbf8e322..542acbd9b281 100644 --- a/.github/scripts/label_utils.py +++ b/.github/scripts/label_utils.py @@ -45,7 +45,11 @@ def get_last_page_num_from_header(header: Any) -> int: # rel="next", ; rel="last" link_info = header["link"] # Docs does not specify that it should be present for projects with just few labels +<<<<<<< HEAD # And https://github.com/malfet/deleteme/actions/runs/7334565243/job/19971396887 it's not the case +======= + # And https://github.com/malfet/deleteme/actions/runs/7334565243/job/19971396887 it's not the case # @lint-ignore +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if link_info is None: return 1 prefix = "&page=" diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh index a3d78d116b3b..f1f479a0b3f9 100755 --- a/.github/scripts/lintrunner.sh +++ b/.github/scripts/lintrunner.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash set -ex +<<<<<<< HEAD # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)" @@ -8,6 +9,10 @@ conda activate "${CONDA_ENV}" # Use uv to speed up lintrunner init python3 -m pip install uv==0.1.45 +======= +# Use uv to speed up lintrunner init +python3 -m pip install uv==0.1.45 setuptools +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CACHE_DIRECTORY="/tmp/.lintbin" # Try to recover the cached binaries @@ -36,6 +41,12 @@ python3 -m tools.pyi.gen_pyi \ --deprecated-functions-path "tools/autograd/deprecated.yaml" python3 torch/utils/data/datapipes/gen_pyi.py +<<<<<<< HEAD +======= +# Also check generated pyi files +find torch -name '*.pyi' -exec git add --force -- "{}" + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RC=0 # Run lintrunner on all files if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then @@ -46,6 +57,12 @@ if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} RC=1 fi +<<<<<<< HEAD +======= +# Unstage temporally added pyi files +find torch -name '*.pyi' -exec git restore --staged -- "{}" + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use jq to massage the JSON lint output into GitHub Actions workflow commands. jq --raw-output \ '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ diff --git a/.github/scripts/pr-sanity-check.sh b/.github/scripts/pr-sanity-check.sh index 2b33dd91f770..82a74b535a6e 100644 --- a/.github/scripts/pr-sanity-check.sh +++ b/.github/scripts/pr-sanity-check.sh @@ -12,7 +12,11 @@ BASE=${BASE:-HEAD~1} HEAD=${HEAD:-HEAD} ancestor=$(git merge-base "${BASE}" "${HEAD}") +<<<<<<< HEAD echo "INFO: Checking aginst the following stats" +======= +echo "INFO: Checking against the following stats" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ( set -x git diff --stat=10000 "$ancestor" "${HEAD}" | sed '$d' > "${TMPFILE}" diff --git a/.github/scripts/runner_determinator.py b/.github/scripts/runner_determinator.py index e6846e42475b..69045db55bd7 100644 --- a/.github/scripts/runner_determinator.py +++ b/.github/scripts/runner_determinator.py @@ -199,6 +199,19 @@ def parse_args() -> Any: help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked", ) parser.add_argument( +<<<<<<< HEAD +======= + "--opt-out-experiments", + type=_str_comma_separated_to_set, + required=False, + default="", + help=( + "comma separated list of experiments to opt-out of. If unset, no opt-outs will occur. 
" + "If the same experiment is listed both here and in '--eligible-experiments' opt-out will take priority." + ), + ) + parser.add_argument( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "--pr-number", type=str, required=False, @@ -422,6 +435,10 @@ def get_runner_prefix( workflow_requestors: Iterable[str], branch: str, eligible_experiments: frozenset[str] = frozenset(), +<<<<<<< HEAD +======= + opt_out_experiments: frozenset[str] = frozenset(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_canary: bool = False, ) -> str: settings = parse_settings(rollout_state) @@ -436,6 +453,17 @@ def get_runner_prefix( ) continue +<<<<<<< HEAD +======= + if opt_out_experiments: + if experiment_name in opt_out_experiments: + opt_out_exp_list = ", ".join(opt_out_experiments) + log.info( + f"Skipping experiment '{experiment_name}', as this workflow has opted-out (opted out experiments are: {opt_out_exp_list})" + ) + continue + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if eligible_experiments: if experiment_name not in eligible_experiments: exp_list = ", ".join(eligible_experiments) @@ -600,6 +628,10 @@ def main() -> None: (args.github_issue_owner, username), args.github_branch, args.eligible_experiments, +<<<<<<< HEAD +======= + args.opt_out_experiments, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_canary, ) diff --git a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile index 7e7f47a459f3..0b0bf7f5d020 100644 --- a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile +++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile @@ -5,6 +5,53 @@ FROM --platform=linux/amd64 docker.io/ubuntu:24.04 as ld-prefix ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get -y install ca-certificates libicu74 libssl3 +<<<<<<< HEAD +======= +# Patched podman +FROM --platform=linux/s390x docker.io/ubuntu:24.04 as podman +ENV DEBIAN_FRONTEND=noninteractive +RUN sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources +RUN apt-get update && \ + apt-get install -y \ + cmake \ + curl \ + devscripts \ + dpkg-dev \ + gdb \ + less \ + make \ + python3 \ + python3-pip \ + quilt \ + rsync \ + software-properties-common \ + stress-ng \ + vim \ + nano \ + wget && \ + apt-get build-dep -y podman && \ + apt-get source podman + +COPY podman-patches/podman-25245.patch /tmp/podman-25245.patch +COPY podman-patches/podman-25102-backport.patch /tmp/podman-25102-backport.patch + +# import and apply patches +# patches: +# https://github.com/containers/podman/pull/25102 +# https://github.com/containers/podman/pull/25245 +RUN cd /libpod-* && \ + quilt import /tmp/podman-25245.patch && quilt push && \ + quilt import /tmp/podman-25102-backport.patch && quilt push && \ + dch -i "Fix podman deadlock and add option to clean up build leftovers" && \ + /bin/rm /tmp/podman-25245.patch /tmp/podman-25102-backport.patch + +# build patched podman +RUN cd /libpod-* && \ + debuild -i -us -uc -b && \ + /bin/rm /podman-remote_*.deb && \ + mkdir /tmp/podman && cp -v /podman*.deb /tmp/podman + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) # Main image. FROM --platform=linux/s390x docker.io/ubuntu:24.04 @@ -45,7 +92,15 @@ COPY fs/ / RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint # install podman +<<<<<<< HEAD RUN apt -y install podman podman-docker +======= +# RUN apt-get update && apt -y install podman podman-docker + +# install patched podman +COPY --from=podman /tmp/podman /tmp/podman +RUN apt-get update && apt -y install /tmp/podman/*.deb && /bin/rm -rfv /tmp/podman +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # amd64 Github Actions Runner. RUN useradd -m actions-runner @@ -65,7 +120,11 @@ RUN virtualenv --system-site-packages venv # COPY --chown=actions-runner:actions-runner manywheel-s390x.tar /home/actions-runner/manywheel-s390x.tar +<<<<<<< HEAD RUN curl -L https://github.com/actions/runner/releases/download/v2.317.0/actions-runner-linux-x64-2.317.0.tar.gz | tar -xz +======= +RUN curl -L https://github.com/actions/runner/releases/download/v2.322.0/actions-runner-linux-x64-2.322.0.tar.gz | tar -xz +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ENTRYPOINT ["/usr/bin/entrypoint"] CMD ["/usr/bin/actions-runner"] diff --git a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner index 0fe99fe85da7..121f18c71a19 100644 --- a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner +++ b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner @@ -27,6 +27,12 @@ unset ACCESS_TOKEN # it does one job, stops and unregisters registration_token=$(jq --raw-output .token "$token_file") +<<<<<<< HEAD +======= +# workaround for https://gitlab.com/qemu-project/qemu/-/issues/2600 +export DOTNET_EnableWriteXorExecute=0 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ./config.sh \ --unattended \ --ephemeral \ @@ -44,8 +50,11 @@ rm -f "$token_file" # and it doesn't work for non-root user source venv/bin/activate +<<<<<<< HEAD # workaround for https://gitlab.com/qemu-project/qemu/-/issues/2600 export DOTNET_EnableWriteXorExecute=0 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Run one job. ./run.sh diff --git a/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25102-backport.patch b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25102-backport.patch new file mode 100644 index 000000000000..16dc10e85f6d --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25102-backport.patch @@ -0,0 +1,358 @@ +diff --git a/cmd/podman/system/prune.go b/cmd/podman/system/prune.go +index f7cf7b551..739f87cde 100644 +--- a/cmd/podman/system/prune.go ++++ b/cmd/podman/system/prune.go +@@ -48,6 +48,7 @@ func init() { + flags.BoolVarP(&force, "force", "f", false, "Do not prompt for confirmation. 
The default is false") + flags.BoolVarP(&pruneOptions.All, "all", "a", false, "Remove all unused data") + flags.BoolVar(&pruneOptions.External, "external", false, "Remove container data in storage not controlled by podman") ++ flags.BoolVar(&pruneOptions.Build, "build", false, "Remove build containers") + flags.BoolVar(&pruneOptions.Volume, "volumes", false, "Prune volumes") + filterFlagName := "filter" + flags.StringArrayVar(&filters, filterFlagName, []string{}, "Provide filter values (e.g. 'label==')") +@@ -64,8 +65,12 @@ func prune(cmd *cobra.Command, args []string) error { + volumeString = ` + - all volumes not used by at least one container` + } +- +- fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, "Are you sure you want to continue? [y/N] ") ++ buildString := "" ++ if pruneOptions.Build { ++ buildString = ` ++ - all build containers` ++ } ++ fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, buildString, "Are you sure you want to continue? [y/N] ") + + answer, err := reader.ReadString('\n') + if err != nil { +@@ -124,7 +129,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string { + if pruneOpts.All { + return `WARNING! This command removes: + - all stopped containers +- - all networks not used by at least one container%s ++ - all networks not used by at least one container%s%s + - all images without at least one container associated with them + - all build cache + +@@ -132,7 +137,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string { + } + return `WARNING! This command removes: + - all stopped containers +- - all networks not used by at least one container%s ++ - all networks not used by at least one container%s%s + - all dangling images + - all dangling build cache + +diff --git a/docs/source/markdown/podman-system-prune.1.md b/docs/source/markdown/podman-system-prune.1.md +index 52f9ec1c7..95099d018 100644 +--- a/docs/source/markdown/podman-system-prune.1.md ++++ b/docs/source/markdown/podman-system-prune.1.md +@@ -7,20 +7,28 @@ podman\-system\-prune - Remove all unused pods, containers, images, networks, an + **podman system prune** [*options*] + + ## DESCRIPTION +-**podman system prune** removes all unused containers (both dangling and unreferenced), pods, networks, and optionally, volumes from local storage. ++**podman system prune** removes all unused containers (both dangling and unreferenced), build containers, pods, networks, and optionally, volumes from local storage. + + Use the **--all** option to delete all unused images. Unused images are dangling images as well as any image that does not have any containers based on it. + + By default, volumes are not removed to prevent important data from being deleted if there is currently no container using the volume. Use the **--volumes** flag when running the command to prune volumes as well. + ++By default, build containers are not removed to prevent interference with builds in progress. Use the **--build** flag when running the command to remove build containers as well. ++ + ## OPTIONS + #### **--all**, **-a** + + Recursively remove all unused pods, containers, images, networks, and volume data. (Maximum 50 iterations.) + ++#### **--build** ++ ++Removes any build containers that were created during the build, but were not removed because the build was unexpectedly terminated. ++ ++Note: **This is not safe operation and should be executed only when no builds are in progress. 
It can interfere with builds in progress.** ++ + #### **--external** + +-Removes all leftover container storage files from local storage not managed by Podman. In normal circumstances, no such data exists, but in case of an unclean shutdown, the Podman database may be corrupted and cause this. ++Tries to clean up remainders of previous containers or layers that are not references in the storage json files. These can happen in the case of unclean shutdowns or regular restarts in transient storage mode. + + However, when using transient storage mode, the Podman database does not persist. This means containers leave the writable layers on disk after a reboot. When using a transient store, it is recommended that the **podman system prune --external** command is run during boot. + +diff --git a/libpod/runtime.go b/libpod/runtime.go +index 986e40f60..609fbba57 100644 +--- a/libpod/runtime.go ++++ b/libpod/runtime.go +@@ -33,6 +33,7 @@ import ( + "github.com/containers/podman/v4/libpod/lock" + "github.com/containers/podman/v4/libpod/plugin" + "github.com/containers/podman/v4/libpod/shutdown" ++ "github.com/containers/podman/v4/pkg/domain/entities/reports" + "github.com/containers/podman/v4/pkg/rootless" + "github.com/containers/podman/v4/pkg/systemd" + "github.com/containers/podman/v4/pkg/util" +@@ -1250,3 +1251,52 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) { + + return toReturn, locksHeld, nil + } ++ ++// Exists checks whether a file or directory exists at the given path. ++// If the path is a symlink, the symlink is followed. ++func Exists(path string) error { ++ // It uses unix.Faccessat which is a faster operation compared to os.Stat for ++ // simply checking the existence of a file. ++ err := unix.Faccessat(unix.AT_FDCWD, path, unix.F_OK, 0) ++ if err != nil { ++ return &os.PathError{Op: "faccessat", Path: path, Err: err} ++ } ++ return nil ++} ++ ++// PruneBuildContainers removes any build containers that were created during the build, ++// but were not removed because the build was unexpectedly terminated. ++// ++// Note: This is not safe operation and should be executed only when no builds are in progress. It can interfere with builds in progress. 
++func (r *Runtime) PruneBuildContainers() ([]*reports.PruneReport, error) { ++ stageContainersPruneReports := []*reports.PruneReport{} ++ ++ containers, err := r.store.Containers() ++ if err != nil { ++ return stageContainersPruneReports, err ++ } ++ for _, container := range containers { ++ path, err := r.store.ContainerDirectory(container.ID) ++ if err != nil { ++ return stageContainersPruneReports, err ++ } ++ if err := Exists(filepath.Join(path, "buildah.json")); err != nil { ++ continue ++ } ++ ++ report := &reports.PruneReport{ ++ Id: container.ID, ++ } ++ size, err := r.store.ContainerSize(container.ID) ++ if err != nil { ++ report.Err = err ++ } ++ report.Size = uint64(size) ++ ++ if err := r.store.DeleteContainer(container.ID); err != nil { ++ report.Err = errors.Join(report.Err, err) ++ } ++ stageContainersPruneReports = append(stageContainersPruneReports, report) ++ } ++ return stageContainersPruneReports, nil ++} +diff --git a/pkg/api/handlers/libpod/system.go b/pkg/api/handlers/libpod/system.go +index 70d4493f8..7c129b1ba 100644 +--- a/pkg/api/handlers/libpod/system.go ++++ b/pkg/api/handlers/libpod/system.go +@@ -22,6 +22,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) { + All bool `schema:"all"` + Volumes bool `schema:"volumes"` + External bool `schema:"external"` ++ Build bool `schema:"build"` + }{} + + if err := decoder.Decode(&query, r.URL.Query()); err != nil { +@@ -43,6 +44,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) { + Volume: query.Volumes, + Filters: *filterMap, + External: query.External, ++ Build: query.Build, + } + report, err := containerEngine.SystemPrune(r.Context(), pruneOptions) + if err != nil { +diff --git a/pkg/bindings/system/types.go b/pkg/bindings/system/types.go +index 89e093f68..b4a4ff064 100644 +--- a/pkg/bindings/system/types.go ++++ b/pkg/bindings/system/types.go +@@ -18,6 +18,7 @@ type PruneOptions struct { + Filters map[string][]string + Volumes *bool + External *bool ++ Build *bool + } + + // VersionOptions are optional options for getting version info +diff --git a/pkg/bindings/system/types_prune_options.go b/pkg/bindings/system/types_prune_options.go +index d00498520..5f3bd652c 100644 +--- a/pkg/bindings/system/types_prune_options.go ++++ b/pkg/bindings/system/types_prune_options.go +@@ -76,3 +76,18 @@ func (o *PruneOptions) GetExternal() bool { + } + return *o.External + } ++ ++// WithBuild set field Build to given value ++func (o *PruneOptions) WithBuild(value bool) *PruneOptions { ++ o.Build = &value ++ return o ++} ++ ++// GetBuild returns value of field Build ++func (o *PruneOptions) GetBuild() bool { ++ if o.Build == nil { ++ var z bool ++ return z ++ } ++ return *o.Build ++} +diff --git a/pkg/domain/entities/system.go b/pkg/domain/entities/system.go +index 473db3530..f6938652a 100644 +--- a/pkg/domain/entities/system.go ++++ b/pkg/domain/entities/system.go +@@ -22,6 +22,7 @@ type SystemPruneOptions struct { + Volume bool + Filters map[string][]string `json:"filters" schema:"filters"` + External bool ++ Build bool + } + + // SystemPruneReport provides report after system prune is executed. +diff --git a/pkg/domain/infra/abi/system.go b/pkg/domain/infra/abi/system.go +index 24ee64d29..ea3e5f203 100644 +--- a/pkg/domain/infra/abi/system.go ++++ b/pkg/domain/infra/abi/system.go +@@ -150,16 +150,16 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) + return nil + } + +-// SystemPrune removes unused data from the system. Pruning pods, containers, networks, volumes and images. 
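The PruneBuildContainers helper added above flags leftover build (stage) containers by checking for a buildah.json file in each container's storage directory before deleting it. A rough Python analogue of just that detection step, with made-up stand-ins for podman's store API:

import os

def find_stage_containers(container_dirs: dict[str, str]) -> list[str]:
    # container_dirs maps container ID -> storage directory, standing in for
    # r.store.Containers() / r.store.ContainerDirectory() in the Go code above.
    return [
        cid
        for cid, path in container_dirs.items()
        if os.path.exists(os.path.join(path, "buildah.json"))
    ]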
++// SystemPrune removes unused data from the system. Pruning pods, containers, build container, networks, volumes and images. + func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.SystemPruneOptions) (*entities.SystemPruneReport, error) { + var systemPruneReport = new(entities.SystemPruneReport) + + if options.External { +- if options.All || options.Volume || len(options.Filters) > 0 { ++ if options.All || options.Volume || len(options.Filters) > 0 || options.Build { + return nil, fmt.Errorf("system prune --external cannot be combined with other options") + } +- err := ic.Libpod.GarbageCollect() +- if err != nil { ++ ++ if err := ic.Libpod.GarbageCollect(); err != nil { + return nil, err + } + return systemPruneReport, nil +@@ -170,6 +170,17 @@ func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.Sys + filters = append(filters, fmt.Sprintf("%s=%s", k, v[0])) + } + reclaimedSpace := (uint64)(0) ++ ++ // Prune Build Containers ++ if options.Build { ++ stageContainersPruneReports, err := ic.Libpod.PruneBuildContainers() ++ if err != nil { ++ return nil, err ++ } ++ reclaimedSpace += reports.PruneReportsSize(stageContainersPruneReports) ++ systemPruneReport.ContainerPruneReports = append(systemPruneReport.ContainerPruneReports, stageContainersPruneReports...) ++ } ++ + found := true + for found { + found = false +diff --git a/pkg/domain/infra/tunnel/system.go b/pkg/domain/infra/tunnel/system.go +index fc82e7b2b..142a9fa5c 100644 +--- a/pkg/domain/infra/tunnel/system.go ++++ b/pkg/domain/infra/tunnel/system.go +@@ -19,7 +19,7 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool) + + // SystemPrune prunes unused data from the system. + func (ic *ContainerEngine) SystemPrune(ctx context.Context, opts entities.SystemPruneOptions) (*entities.SystemPruneReport, error) { +- options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External) ++ options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External).WithBuild(opts.Build) + return system.Prune(ic.ClientCtx, options) + } + +diff --git a/test/e2e/prune_test.go b/test/e2e/prune_test.go +index 01e848478..57bd5582d 100644 +--- a/test/e2e/prune_test.go ++++ b/test/e2e/prune_test.go +@@ -4,6 +4,8 @@ import ( + "fmt" + "os" + "path/filepath" ++ "syscall" ++ "time" + + . "github.com/containers/podman/v4/test/utils" + . "github.com/onsi/ginkgo/v2" +@@ -22,6 +24,11 @@ FROM scratch + ENV test1=test1 + ENV test2=test2` + ++var longBuildImage = fmt.Sprintf(` ++FROM %s ++RUN echo "Hello, World!" 
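The SystemPrune change above extends the existing guard so that --external also rejects the new --build flag. A condensed Python restatement of that option check (field names mirror the Go struct; the function itself is illustrative, not part of the patch):

def validate_prune_options(all_: bool, volumes: bool, filters: dict, external: bool, build: bool) -> None:
    # Mirrors the guard in pkg/domain/infra/abi/system.go: --external must be used alone.
    if external and (all_ or volumes or filters or build):
        raise ValueError("system prune --external cannot be combined with other options")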
++RUN RUN echo "Please use signal 9 this will never ends" && sleep 10000s`, ALPINE) ++ + var _ = Describe("Podman prune", func() { + + It("podman container prune containers", func() { +@@ -593,4 +600,63 @@ var _ = Describe("Podman prune", func() { + Expect(err).ToNot(HaveOccurred()) + Expect(dirents).To(HaveLen(3)) + }) ++ ++ It("podman system prune --build clean up after terminated build", func() { ++ useCustomNetworkDir(podmanTest, tempdir) ++ ++ podmanTest.BuildImage(pruneImage, "alpine_notleaker:latest", "false") ++ ++ create := podmanTest.Podman([]string{"create", "--name", "test", BB, "sleep", "10000"}) ++ create.WaitWithDefaultTimeout() ++ Expect(create).Should(ExitCleanly()) ++ ++ containerFilePath := filepath.Join(podmanTest.TempDir, "ContainerFile-podman-leaker") ++ err := os.WriteFile(containerFilePath, []byte(longBuildImage), 0755) ++ Expect(err).ToNot(HaveOccurred()) ++ ++ build := podmanTest.Podman([]string{"build", "-f", containerFilePath, "-t", "podmanleaker"}) ++ // Build will never finish so let's wait for build to ask for SIGKILL to simulate a failed build that leaves stage containers. ++ matchedOutput := false ++ for range 900 { ++ if build.LineInOutputContains("Please use signal 9") { ++ matchedOutput = true ++ build.Signal(syscall.SIGKILL) ++ break ++ } ++ time.Sleep(100 * time.Millisecond) ++ } ++ if !matchedOutput { ++ Fail("Did not match special string in podman build") ++ } ++ ++ // Check Intermediate image of stage container ++ none := podmanTest.Podman([]string{"images", "-a"}) ++ none.WaitWithDefaultTimeout() ++ Expect(none).Should(ExitCleanly()) ++ Expect(none.OutputToString()).Should(ContainSubstring("none")) ++ ++ // Check if Container and Stage Container exist ++ count := podmanTest.Podman([]string{"ps", "-aq", "--external"}) ++ count.WaitWithDefaultTimeout() ++ Expect(count).Should(ExitCleanly()) ++ Expect(count.OutputToStringArray()).To(HaveLen(3)) ++ ++ prune := podmanTest.Podman([]string{"system", "prune", "--build", "-f"}) ++ prune.WaitWithDefaultTimeout() ++ Expect(prune).Should(ExitCleanly()) ++ ++ // Container should still exist, but no stage containers ++ count = podmanTest.Podman([]string{"ps", "-aq", "--external"}) ++ count.WaitWithDefaultTimeout() ++ Expect(count).Should(ExitCleanly()) ++ Expect(count.OutputToString()).To(BeEmpty()) ++ ++ Expect(podmanTest.NumberOfContainers()).To(Equal(0)) ++ ++ after := podmanTest.Podman([]string{"images", "-a"}) ++ after.WaitWithDefaultTimeout() ++ Expect(after).Should(ExitCleanly()) ++ Expect(after.OutputToString()).ShouldNot(ContainSubstring("none")) ++ Expect(after.OutputToString()).Should(ContainSubstring("notleaker")) ++ }) + }) + diff --git a/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25245.patch b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25245.patch new file mode 100644 index 000000000000..bf79f7904035 --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25245.patch @@ -0,0 +1,21 @@ +diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c +index 4f71d49e5c..3d74af6a6c 100644 +--- a/pkg/rootless/rootless_linux.c ++++ b/pkg/rootless/rootless_linux.c +@@ -658,7 +658,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv) + if (pipe (p) < 0) + return -1; + +- pid = fork (); ++ pid = syscall_clone (SIGCHLD, NULL); + if (pid < 0) + { + close (p[0]); +@@ -689,7 +689,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv) + close (p[0]); + + setsid (); +- pid = fork (); ++ 
pid = syscall_clone (SIGCHLD, NULL); + if (pid < 0) + _exit (EXIT_FAILURE); diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py index 378f72237601..15bcd3ef6873 100755 --- a/.github/scripts/test_filter_test_configs.py +++ b/.github/scripts/test_filter_test_configs.py @@ -347,26 +347,46 @@ def test_set_periodic_modes(self) -> None: { "job_name": "a-ci-job", "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', +<<<<<<< HEAD "descripion": "Replicate each periodic mode in a different config", +======= + "description": "Replicate each periodic mode in a different config", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, { "job_name": "a-ci-cuda11.8-job", "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', +<<<<<<< HEAD "descripion": "Replicate each periodic mode in a different config for a CUDA job", +======= + "description": "Replicate each periodic mode in a different config for a CUDA job", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, { "job_name": "a-ci-rocm-job", "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', +<<<<<<< HEAD "descripion": "Replicate each periodic mode in a different config for a ROCm job", +======= + "description": "Replicate each periodic mode in a different config for a ROCm job", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, { "job_name": "", "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', +<<<<<<< HEAD "descripion": "Empty job name", }, { "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', "descripion": "Missing job name", +======= + "description": "Empty job name", + }, + { + "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}', + "description": "Missing job name", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, ] @@ -807,7 +827,11 @@ def test_parse_reenabled_issues(self) -> None: # test bad things pr_body = ( "fixes189 fixeshttps://github.com/pytorch/pytorch/issues/75123 " +<<<<<<< HEAD "closedhttps://githubcom/pytorch/pytorch/issues/75123" +======= + "closedhttps://githubcom/pytorch/pytorch/issues/75123" # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "fix 234, fixes # 45, fixing #123, close 234, closes#45, closing #123 resolve 234, " "resolves #45, resolving #123" ) diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index 1a152dc95945..225546e48720 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -19,6 +19,10 @@ from github_utils import gh_graphql from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo from trymerge import ( +<<<<<<< HEAD +======= + _revlist_to_prs, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) categorize_checks, DRCI_CHECKRUN_NAME, find_matching_merge_rule, @@ -264,7 +268,11 @@ def commits_resolving_gh_pr(self, pr_num: 
int) -> list[str]:
         return ["FakeCommitSha"]
 
     def commit_message(self, ref: str) -> str:
+<<<<<<< HEAD
         return "super awsome commit message"
+=======
+        return "super awesome commit message"
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
 
 @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@@ -432,7 +440,11 @@ def test_get_checkruns_many_runs(self, *args: Any) -> None:
         )
 
     def test_cancelled_gets_ignored(self, *args: Any) -> None:
+<<<<<<< HEAD
         """Tests that cancelled workflow does not override existing successfull status"""
+=======
+        """Tests that cancelled workflow does not override existing successful status"""
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
         pr = GitHubPR("pytorch", "pytorch", 110367)
         conclusions = pr.get_checkrun_conclusions()
         lint_checks = [name for name in conclusions.keys() if "Lint" in name]
@@ -1088,5 +1100,54 @@ def test_merge_ghstack_into(
         )
 
 
+<<<<<<< HEAD
+=======
+@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
+@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
+@mock.patch(
+    "trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
+)
+@mock.patch.object(DummyGitRepo, "commit_message")
+class TestRevListToPR(TestCase):
+    # Tests for _revlist_to_prs function
+    def test__revlist_to_prs_zero_matches(
+        self, mock_commit_message: mock.MagicMock, *args: Any
+    ) -> None:
+        # If zero PRs are mentioned in the commit message, it should raise an error
+        pr_num = 154098
+        pr = GitHubPR("pytorch", "pytorch", pr_num)
+        repo = DummyGitRepo()
+        mock_commit_message.return_value = "no PRs"
+        self.assertRaisesRegex(
+            RuntimeError,
+            "PRs mentioned in commit dummy: 0.",
+            lambda: _revlist_to_prs(repo, pr, ["dummy"]),
+        )
+
+    def test__revlist_to_prs_two_prs(
+        self, mock_commit_message: mock.MagicMock, *args: Any
+    ) -> None:
+        # If two PRs are mentioned in the commit message, it should raise an error
+        pr_num = 154394
+        pr = GitHubPR("pytorch", "pytorch", pr_num)
+        repo = DummyGitRepo()
+        # https://github.com/pytorch/pytorch/commit/343c56e7650f55fd030aca0b9275d6d73501d3f4
+
+        commit_message = """add sticky cache pgo
+
+ghstack-source-id: 9bc6dee0b427819f978bfabccb72727ba8be2f81
+Pull-Request-resolved: https://github.com/pytorch/pytorch/pull/154098
+
+ghstack-source-id: 9bc6dee0b427819f978bfabccb72727ba8be2f81
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
+        mock_commit_message.return_value = commit_message
+        self.assertRaisesRegex(
+            RuntimeError,
+            "PRs mentioned in commit dummy: 2.",
+            lambda: _revlist_to_prs(repo, pr, ["dummy"]),
+        )
+
+
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 if __name__ == "__main__":
     main()
diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py
index e43494e31301..0f4889b10516 100755
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@@ -434,7 +434,11 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]):
 RE_GHSTACK_HEAD_REF = re.compile(r"^(gh/[^/]+/[0-9]+/)head$")
 RE_GHSTACK_DESC = re.compile(r"Stack.*:\r?\n(\* [^\r\n]+\r?\n)+", re.MULTILINE)
 RE_PULL_REQUEST_RESOLVED = re.compile(
+<<<<<<< HEAD
     r"Pull Request resolved: "
+=======
+    r"(Pull Request resolved|Pull-Request-resolved|Pull-Request): "
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel
for mixed dtypes with float/bfloat16/half (#2791)) r"https://github.com/(?P[^/]+)/(?P[^/]+)/pull/(?P[0-9]+)", re.MULTILINE, ) @@ -628,11 +632,25 @@ def _revlist_to_prs( rc: list[tuple[GitHubPR, str]] = [] for idx, rev in enumerate(rev_list): msg = repo.commit_message(rev) +<<<<<<< HEAD m = RE_PULL_REQUEST_RESOLVED.search(msg) if m is None: raise RuntimeError( f"Could not find PR-resolved string in {msg} of ghstacked PR {pr.pr_num}" ) +======= + # findall doesn't return named captures, so we need to use finditer + all_matches = list(RE_PULL_REQUEST_RESOLVED.finditer(msg)) + if len(all_matches) != 1: + raise RuntimeError( + f"Found an unexpected number of PRs mentioned in commit {rev}: " + f"{len(all_matches)}. This is probably because you are using an " + "old version of ghstack. Please update ghstack and resubmit " + "your PRs" + ) + + m = all_matches[0] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if pr.org != m.group("owner") or pr.project != m.group("repo"): raise RuntimeError( f"PR {m.group('number')} resolved to wrong owner/repo pair" @@ -666,6 +684,12 @@ def skip_func(idx: int, candidate: "GitHubPR") -> bool: assert pr.is_ghstack_pr() entire_stack = _revlist_to_prs(repo, pr, reversed(rev_list), skip_func) +<<<<<<< HEAD +======= + print( + f"Found {len(entire_stack)} PRs in the stack for {pr.pr_num}: {[x[0].pr_num for x in entire_stack]}" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for stacked_pr, rev in entire_stack: if stacked_pr.is_closed(): @@ -819,10 +843,16 @@ def _get_reviews(self) -> list[tuple[str, str]]: cursor=info["reviews"]["pageInfo"]["startCursor"], ) info = rc["data"]["repository"]["pullRequest"] +<<<<<<< HEAD reviews = {} for author, state in self._reviews: if state != "COMMENTED": reviews[author] = state +======= + reviews = { + author: state for author, state in self._reviews if state != "COMMENTED" + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return list(reviews.items()) def get_approved_by(self) -> list[str]: @@ -940,6 +970,15 @@ def get_pr_next_checksuites(checksuites: Any) -> Any: summary=None, ) +<<<<<<< HEAD +======= + # Making an exception for Apply lint auggestions/autoformat because the + # bot adds a merged label -> triggers workflow -> sometimes needs + # approval -> is read as failure, which results in a blocked merge, but + # this workflow doesn't provide mergability info + self.conclusions.pop("Apply lint suggestions", None) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self.conclusions def get_authors(self) -> dict[str, str]: @@ -1939,6 +1978,10 @@ def get_ghstack_dependent_prs( def do_revert_prs( repo: GitRepo, +<<<<<<< HEAD +======= + original_pr: GitHubPR, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shas_and_prs: list[tuple[str, GitHubPR]], *, author_login: str, @@ -1960,9 +2003,22 @@ def do_revert_prs( # Comment/reopen PRs for commit_sha, pr in shas_and_prs: +<<<<<<< HEAD revert_message = ( f"@{pr.get_pr_creator_login()} your PR has been successfully reverted." ) +======= + revert_message = "" + if pr.pr_num == original_pr.pr_num: + revert_message += ( + f"@{pr.get_pr_creator_login()} your PR has been successfully reverted." 
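The incoming branch widens RE_PULL_REQUEST_RESOLVED to accept the ghstack spellings shown above and switches from findall to finditer because findall drops named capture groups. A small self-contained check of that behaviour (the pattern restates the incoming regex; the group names match the owner/repo/number groups used later in trymerge.py):

import re

RE_PULL_REQUEST_RESOLVED = re.compile(
    r"(Pull Request resolved|Pull-Request-resolved|Pull-Request): "
    r"https://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>[0-9]+)",
    re.MULTILINE,
)

msg = "Pull-Request-resolved: https://github.com/pytorch/pytorch/pull/154098"
matches = list(RE_PULL_REQUEST_RESOLVED.finditer(msg))  # finditer yields Match objects with named groups
assert len(matches) == 1
assert matches[0].group("owner") == "pytorch" and matches[0].group("number") == "154098"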
+ ) + else: + revert_message += ( + f"@{pr.get_pr_creator_login()} your PR has been reverted as part of the stack under " + f"#{original_pr.pr_num}.\n" + ) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if ( pr.has_internal_changes() and not pr.has_no_connected_diff() @@ -2014,6 +2070,10 @@ def try_revert( do_revert_prs( repo, +<<<<<<< HEAD +======= + pr, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shas_and_prs, author_login=author_login, extra_msg=extra_msg, @@ -2032,7 +2092,11 @@ def check_for_sev(org: str, project: str, skip_mandatory_checks: bool) -> None: response = cast( dict[str, Any], gh_fetch_json_list( +<<<<<<< HEAD "https://api.github.com/search/issues", +======= + "https://api.github.com/search/issues", # @lint-ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Having two label: queries is an AND operation params={ "q": f'repo:{org}/{project} is:open is:issue label:"ci: sev" label:"merge blocking"' @@ -2282,7 +2346,12 @@ def merge( except MandatoryChecksMissingError as ex: last_exception = str(ex) print( +<<<<<<< HEAD f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min" +======= + f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min", + flush=True, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) time.sleep(5 * 60) # Finally report timeout back diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py index 0f6d74e8346e..ddeff1d62d98 100755 --- a/.github/scripts/tryrebase.py +++ b/.github/scripts/tryrebase.py @@ -132,17 +132,30 @@ def rebase_ghstack_onto( # The contents of a successful push result should look like: # Summary of changes (ghstack 0.6.0) +<<<<<<< HEAD # - Updated https://github.com/clee2000/random-testing/pull/2 # - Updated https://github.com/clee2000/random-testing/pull/1 +======= + # - Updated https://github.com/clee2000/random-testing-public/pull/2 + # - Updated https://github.com/clee2000/random-testing-public/pull/1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Facebook employees can import your changes by running # (on a Facebook machine): +<<<<<<< HEAD # ghimport -s https://github.com/clee2000/random-testing/pull/2 # If you want to work on this diff stack on another machine: # ghstack checkout https://github.com/clee2000/random-testing/pull/2 +======= + # ghimport -s https://github.com/clee2000/random-testing-public/pull/2 + + # If you want to work on this diff stack on another machine: + + # ghstack checkout https://github.com/clee2000/random-testing-public/pull/2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) org, project = repo.gh_owner_and_name() for line in push_result.splitlines(): if "Updated" in line: diff --git a/.github/scripts/windows/build_magma.bat b/.github/scripts/windows/build_magma.bat index beabb0070554..b203f9fe64ee 100644 --- a/.github/scripts/windows/build_magma.bat +++ b/.github/scripts/windows/build_magma.bat @@ -17,7 +17,10 @@ if errorlevel 1 exit /b 1 set "PATH=C:\Tools;C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v%CUVER%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\libnvvp;%PATH%" set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER% +<<<<<<< HEAD set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) mkdir magma_cuda%CUVER_NODOT% cd magma_cuda%CUVER_NODOT% @@ -35,6 +38,7 @@ cd magma mkdir build && cd build set GPU_TARGET=All +<<<<<<< HEAD if "%CUVER_NODOT%" == "128" ( set CUDA_ARCH_LIST=-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 ) @@ -44,6 +48,17 @@ if "%CUVER_NODOT:~0,2%" == "12" if NOT "%CUVER_NODOT%" == "128" ( if "%CUVER_NODOT%" == "118" ( set CUDA_ARCH_LIST= -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 ) +======= +if "%CUVER_NODOT%" == "129" ( + set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +) +if "%CUVER_NODOT%" == "128" ( + set CUDA_ARCH_LIST=-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 +) +if "%CUVER_NODOT%" == "126" ( + set CUDA_ARCH_LIST=-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 +) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set CC=cl.exe set CXX=cl.exe @@ -54,7 +69,12 @@ cmake .. -DGPU_TARGET="%GPU_TARGET%" ^ -DCMAKE_BUILD_TYPE=%CONFIG% ^ -DCMAKE_GENERATOR=Ninja ^ -DCMAKE_INSTALL_PREFIX=..\install\ ^ +<<<<<<< HEAD -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%" +======= + -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%" ^ + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if errorlevel 1 exit /b 1 cmake --build . 
--target install --config %CONFIG% -- -j%NUMBER_OF_PROCESSORS% diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 1a2b282690c1..25905be3c775 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -32,7 +32,11 @@ concurrency: {%- macro setup_ec2_windows() -%} !{{ display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index efb415759c95..9bc9a3015051 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -29,6 +29,12 @@ on: {%- endfor %} workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds {%- if "aarch64" in build_environment %} @@ -53,7 +59,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -111,12 +121,21 @@ jobs: ALPINE_IMAGE: "docker.io/s390x/alpine" {%- elif config["gpu_arch_type"] == "rocm" %} runs_on: linux.rocm.gpu +<<<<<<< HEAD {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] == "12.8" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] != "12.8"%} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge.nvidia.gpu +======= + {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.8", "12.9"] %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + {%- elif config["gpu_arch_type"] == "cuda" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- else %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.4xlarge @@ -135,7 +154,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: 
arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -148,6 +171,7 @@ jobs: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: @@ -158,6 +182,29 @@ jobs: uses: ./.github/actions/teardown-xpu {%- else %} runs-on: linux.rocm.gpu +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: !{{ config["container_image"] }} + custom-tag-prefix: !{{ config["container_image_tag_prefix"] }} + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown XPU + uses: ./.github/actions/teardown-xpu + {%- else %} + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config) }} steps: @@ -172,12 +219,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: !{{ config["container_image"] }} - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: !{{ config["container_image"] }} + custom-tag-prefix: !{{ config["container_image_tag_prefix"] }} + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm {%- endif %} diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 9190ef7deb88..0b3fcb8882cf 100644 --- a/.github/templates/upload.yml.j2 
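The runner selection that the incoming branch encodes in the linux binary build template above amounts to: ROCm jobs stay on the ROCm fleet, CUDA 12.8 and 12.9 builds need an sm_70+ (g4dn) runner, other CUDA builds use the plain GPU runner, and everything else falls back to linux.4xlarge. A condensed Python restatement of those Jinja conditionals (illustrative only, omitting the s390x and runner-prefix details):

def pick_runs_on(gpu_arch_type: str, gpu_arch_version: str) -> str:
    if gpu_arch_type == "rocm":
        return "linux.rocm.gpu"
    if gpu_arch_type == "cuda" and gpu_arch_version in ("12.8", "12.9"):
        return "linux.g4dn.4xlarge.nvidia.gpu"  # 12.8 and 12.9 builds need an sm_70+ runner
    if gpu_arch_type == "cuda":
        return "linux.4xlarge.nvidia.gpu"
    return "linux.4xlarge"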
+++ b/.github/templates/upload.yml.j2 @@ -23,11 +23,17 @@ {%- endif %} {%- if not is_windows %} DOCKER_IMAGE: !{{ config["container_image"] }} +<<<<<<< HEAD {%- endif %} {%- if config["package_type"] == "manywheel" %} {%- if config["devtoolset"] %} DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} {%- endif %} +======= + DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }} +{%- endif %} +{%- if config["package_type"] == "manywheel" %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if config.use_split_build is defined %} use_split_build: !{{ config["use_split_build"] }} {%- endif %} @@ -37,9 +43,12 @@ LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} {%- endif %} LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }} +<<<<<<< HEAD {%- if config["devtoolset"] %} DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} {%- endif %} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index 5bb241b66db9..a47922772eaa 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -49,13 +49,29 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: !{{ os }} +{%- if os == "windows-arm64" %} + PYTORCH_ROOT: /pytorch + DOWNLOADS_DIR: c:\temp\downloads + DEPENDENCIES_DIR: c:\temp\dependencies + ENABLE_APL: 1 + ENABLE_OPENBLAS: 0 + MSVC_VERSION : 14.42 +{%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !{{ common.concurrency(build_environment) }} jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -66,20 +82,80 @@ jobs: !{{ config["build_name"] }}-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD +======= + {%- if os == "windows-arm64" %} + runs-on: "windows-11-arm64-preview" + {%- else %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if branches == "nightly" %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" {%- else %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" {%- endif %} +<<<<<<< HEAD +======= + {%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: !{{ common.timeout_minutes_windows_binary }} !{{ upload.binary_env(config, True) }} {%- if config.pytorch_extra_install_requirements is defined and 
config.pytorch_extra_install_requirements|d('')|length > 0 %} PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }} {%- endif %} steps: +<<<<<<< HEAD !{{ common.setup_ec2_windows() }} !{{ set_runner_specific_vars() }} !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +======= +{%- if os == "windows-arm64" %} + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Bootstrap folders + shell: cmd + run: | + mkdir "%NIGHTLIES_PYTORCH_ROOT%" + mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - name: Bootstrap sccache + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat" + - name: Bootstrap Libuv + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat" +{%- else %} + !{{ set_runner_specific_vars() }} + !{{ common.setup_ec2_windows() }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +{%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -95,12 +171,24 @@ jobs: retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + !{{ common.wait_and_kill_ssh_windows('pytorch') }} +======= +{%- if os != "windows-arm64" %} !{{ common.wait_and_kill_ssh_windows('pytorch') }} +{% endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) !{{ config["build_name"] }}-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - !{{ config["build_name"] }}-build - get-label-type +<<<<<<< HEAD +======= +{%- if os == "windows-arm64" %} + runs-on: "windows-11-arm64-preview" +{%- else %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) {%- if config["gpu_arch_type"] == "cuda" %} {%- if branches == "nightly" %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" @@ -113,18 +201,62 @@ jobs: {%- else %} runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" {%- endif %} +{%- endif %} +<<<<<<< HEAD + timeout-minutes: !{{ common.timeout_minutes_windows_binary }} + !{{ upload.binary_env(config, True) }} + steps: + !{{ common.setup_ec2_windows() }} + !{{ set_runner_specific_vars() }} +======= {%- endif %} timeout-minutes: !{{ common.timeout_minutes_windows_binary }} !{{ upload.binary_env(config, True) }} steps: +{%- if os == "windows-arm64" %} + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo 
PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" +{%- else %} !{{ common.setup_ec2_windows() }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} !{{ set_runner_specific_vars() }} +{%- endif %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: !{{ common.download_artifact_action }} name: Download Build Artifacts with: name: !{{ config["build_name"] }} path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -133,8 +265,18 @@ jobs: shell: bash run: | "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" +<<<<<<< HEAD + !{{ common.wait_and_kill_ssh_windows('pytorch') }} + {%- if branches == "nightly" %} + !{{ upload.upload_binaries(config, True) }} + {%- endif %} +{%- endfor %} +======= +{%- if os != "windows-arm64" %} !{{ common.wait_and_kill_ssh_windows('pytorch') }} +{%- endif %} {%- if branches == "nightly" %} !{{ upload.upload_binaries(config, True) }} {%- endif %} {%- endfor %} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index 0f7ed87f2a4c..739c82447f93 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -47,7 +47,11 @@ jobs: reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false @@ -69,25 +73,41 @@ jobs: runs-on: ${{ matrix.runner }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image-name }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -97,7 +117,11 @@ jobs: run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} - name: Output disk space left @@ -209,5 +233,9 @@ jobs: file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index eab7c43800bc..c9e930c99348 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -23,7 +23,11 @@ on: description: Hardware to run this "build" job on, linux.12xlarge or linux.arm64.2xlarge. 
timeout-minutes: required: false +<<<<<<< HEAD default: 210 +======= + default: 240 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) type: number description: timeout for the job use_split_build: @@ -62,6 +66,13 @@ on: required: true type: string description: Docker image to use +<<<<<<< HEAD +======= + DOCKER_IMAGE_TAG_PREFIX: + required: true + type: string + description: Docker image tag to use +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LIBTORCH_CONFIG: required: false type: string @@ -70,10 +81,13 @@ on: required: false type: string description: Desired libtorch variant (for libtorch builds only) +<<<<<<< HEAD DESIRED_DEVTOOLSET: required: false type: string description: Desired dev toolset +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: required: false type: string @@ -88,6 +102,12 @@ on: required: true description: Github Token +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: build: runs-on: ${{ inputs.runner_prefix}}${{ inputs.runs_on }} @@ -104,7 +124,10 @@ jobs: SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} +<<<<<<< HEAD DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} PYTORCH_EXTRA_INSTALL_REQUIREMENTS: ${{ inputs.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }} ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }} @@ -130,7 +153,10 @@ jobs: echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}" echo "LIBTORCH_CONFIG=${{ env.LIBTORCH_CONFIG }}" echo "LIBTORCH_VARIANT=${{ env.LIBTORCH_VARIANT }}" +<<<<<<< HEAD echo "DESIRED_DEVTOOLSET=${{ env.DESIRED_DEVTOOLSET }}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}" echo "PYTORCH_EXTRA_INSTALL_REQUIREMENTS=${{ env.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }}" echo "ALPINE_IMAGE=${{ env.ALPINE_IMAGE }}" @@ -150,13 +176,21 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.github-token }} - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -184,7 +218,11 @@ jobs: fi - name: Checkout PyTorch to pytorch dir +<<<<<<< HEAD uses: actions/checkout@v4 +======= + uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: recursive path: pytorch @@ -208,6 +246,7 @@ jobs: { config: "default" }, ]} +<<<<<<< HEAD - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 @@ -216,6 +255,42 @@ jobs: - name: Build PyTorch binary if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} +======= + - name: configure aws credentials + id: aws_creds + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' && startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Calculate docker image + id: calculate-docker-image + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + # If doing this in release/2.8 or release branch, use docker.io. Otherwise + # use ECR + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: ${{ inputs.DOCKER_IMAGE }} + custom-tag-prefix: ${{ inputs.DOCKER_IMAGE_TAG_PREFIX }} + # The build.sh script in this folder is not actually the correct one, + # this is just needed for sha calculation + docker-build-dir: .ci/docker + working-directory: pytorch + + - name: Pull Docker image + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Build PyTorch binary + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image || format('{0}:{1}', inputs.DOCKER_IMAGE, inputs.DOCKER_IMAGE_TAG_PREFIX) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | set -x mkdir -p artifacts/ @@ -223,7 +298,10 @@ jobs: -e BINARY_ENV_FILE \ -e BUILD_ENVIRONMENT \ -e DESIRED_CUDA \ +<<<<<<< HEAD -e DESIRED_DEVTOOLSET \ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) -e DESIRED_PYTHON \ -e GITHUB_ACTIONS \ -e GPU_ARCH_TYPE \ @@ -256,7 +334,11 @@ jobs: # Ensure the working directory gets chowned back to the current user docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
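The release/2.8 side of the hunk above resolves the image to build against in two steps: `calculate-docker-image` picks a registry based on the triggering ref, and the `DOCKER_IMAGE` environment variable falls back to `format('{0}:{1}', inputs.DOCKER_IMAGE, inputs.DOCKER_IMAGE_TAG_PREFIX)` when that step is skipped. A minimal shell sketch of the same selection logic follows; the registry hostnames and the `ciflow` ref prefix are taken from the hunk, while the ref, image name, and tag prefix values are hypothetical placeholders.

```bash
#!/usr/bin/env bash
# Sketch only -- mirrors the workflow expressions above, not part of any CI file.
GITHUB_REF="refs/tags/ciflow/binaries/12345"   # placeholder for github.event.ref
DOCKER_IMAGE="pytorch/manylinux2_28-builder"   # placeholder for inputs.DOCKER_IMAGE
DOCKER_IMAGE_TAG_PREFIX="rocm6.4"              # placeholder for inputs.DOCKER_IMAGE_TAG_PREFIX
CALCULATED_IMAGE=""                            # empty when calculate-docker-image was skipped

# startsWith(github.event.ref, 'refs/tags/ciflow/') && <ECR> || 'docker.io'
if [[ "${GITHUB_REF}" == refs/tags/ciflow/* ]]; then
  REGISTRY="308535385114.dkr.ecr.us-east-1.amazonaws.com"
else
  REGISTRY="docker.io"
fi

# steps.calculate-docker-image.outputs.docker-image || format('{0}:{1}', image, tag-prefix)
DOCKER_IMAGE_REF="${CALCULATED_IMAGE:-${DOCKER_IMAGE}:${DOCKER_IMAGE_TAG_PREFIX}}"

echo "registry: ${REGISTRY}"
echo "image:    ${DOCKER_IMAGE_REF}"
```

As in the workflow expression, an empty output from the calculate step falls through to the composed `image:tag-prefix` reference, so the build can still run when the ECR lookup is skipped.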
+<<<<<<< HEAD - uses: actions/upload-artifact@v4.4.0 +======= + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} with: name: ${{ inputs.build_name }} @@ -266,7 +348,11 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 153f1e6d2f1a..23f7fa91a51c 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -39,6 +39,13 @@ on: required: true type: string description: Docker image to use +<<<<<<< HEAD +======= + DOCKER_IMAGE_TAG_PREFIX: + required: true + type: string + description: Docker image tag to use +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LIBTORCH_CONFIG: required: false type: string @@ -47,10 +54,13 @@ on: required: false type: string description: Desired libtorch variant (for libtorch builds only) +<<<<<<< HEAD DESIRED_DEVTOOLSET: required: false type: string description: Desired dev toolset +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: required: false type: string @@ -76,6 +86,12 @@ on: required: true description: Github Token +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: test: runs-on: ${{ inputs.runner_prefix}}${{ inputs.runs_on }} @@ -92,7 +108,10 @@ jobs: SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} +<<<<<<< HEAD DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }} AWS_DEFAULT_REGION: us-east-1 @@ -118,7 +137,10 @@ jobs: echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}" echo "LIBTORCH_CONFIG=${{ env.LIBTORCH_CONFIG }}" echo "LIBTORCH_VARIANT=${{ env.LIBTORCH_VARIANT }}" +<<<<<<< HEAD echo "DESIRED_DEVTOOLSET=${{ env.DESIRED_DEVTOOLSET }}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}" echo "ALPINE_IMAGE=${{ env.ALPINE_IMAGE }}" @@ -133,14 +155,22 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.github-token }} # Setup the environment - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -161,7 +191,11 @@ jobs: mkdir "${GITHUB_WORKSPACE}" - name: Checkout PyTorch to pytorch dir +<<<<<<< HEAD uses: actions/checkout@v4 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: recursive show-progress: false @@ -187,12 +221,17 @@ jobs: - name: Download Build Artifacts if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} +<<<<<<< HEAD uses: actions/download-artifact@v4.1.7 +======= + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build_name }} path: "${{ runner.temp }}/artifacts/" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} @@ -201,14 +240,53 @@ jobs: uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: ${{ inputs.DOCKER_IMAGE }} +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 + if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} + + - name: configure aws credentials + id: aws_creds + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' && startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Calculate docker image + id: calculate-docker-image + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: ${{ inputs.DOCKER_IMAGE }} + custom-tag-prefix: ${{ inputs.DOCKER_IMAGE_TAG_PREFIX }} + docker-build-dir: .ci/docker + working-directory: pytorch + + - name: Pull Docker image + if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Test Pytorch binary if: 
${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} uses: ./pytorch/.github/actions/test-pytorch-binary +<<<<<<< HEAD - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image || format('{0}:{1}', inputs.DOCKER_IMAGE, inputs.DOCKER_IMAGE_TAG_PREFIX) }} + + - name: Teardown Linux + if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 296ac999c8c2..828441619ab9 100644 --- a/.github/workflows/_binary-upload.yml +++ b/.github/workflows/_binary-upload.yml @@ -35,6 +35,13 @@ on: required: false type: string description: Docker image to use +<<<<<<< HEAD +======= + DOCKER_IMAGE_TAG_PREFIX: + required: false + type: string + description: Docker image tag to use +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LIBTORCH_CONFIG: required: false type: string @@ -43,10 +50,13 @@ on: required: false type: string description: Desired libtorch variant (for libtorch builds only) +<<<<<<< HEAD DESIRED_DEVTOOLSET: required: false type: string description: Desired dev toolset +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: required: false type: string @@ -80,7 +90,10 @@ jobs: SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }} LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }} +<<<<<<< HEAD DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }} BINARY_ENV_FILE: /tmp/env GITHUB_TOKEN: ${{ secrets.github-token }} @@ -90,20 +103,32 @@ jobs: USE_SPLIT_BUILD: ${{ inputs.use_split_build }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true - name: Configure AWS credentials(PyTorch account) for nightly if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels aws-region: us-east-1 - name: Configure AWS credentials(PyTorch account) for RC builds if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: 
aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels aws-region: us-east-1 @@ -113,7 +138,11 @@ jobs: # NB: When the previous build job is skipped, there won't be any artifacts and # this step will fail. Binary build jobs can only be skipped on CI, not nightly continue-on-error: true +<<<<<<< HEAD uses: actions/download-artifact@v4.1.7 +======= + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build_name }} path: "${{ runner.temp }}/artifacts/" diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index cf1788a2d78a..560cb6a166b9 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -84,7 +84,11 @@ jobs: name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -95,14 +99,22 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: configure aws credentials if : ${{ inputs.aws-role-to-assume != '' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-test @@ -110,12 +122,20 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -185,14 +205,22 @@ jobs: - name: configure aws credentials if : ${{ inputs.upload-aws-role-to-assume != '' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: ${{ inputs.upload-aws-role-to-assume }} role-session-name: gha-linux-test aws-region: us-east-1 - name: Upload Python Docs Preview +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' && steps.build-docs.outcome == 'success' }} with: retention-days: 14 @@ -202,7 +230,11 @@ jobs: s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }} - name: Upload C++ Docs Preview +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' && steps.build-docs.outcome == 'success' }} with: retention-days: 14 @@ -212,7 +244,11 @@ jobs: s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs - name: Upload functorch Docs Preview +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }} with: retention-days: 14 @@ -222,5 +258,9 @@ jobs: s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml new file mode 100644 index 000000000000..bfe554decf53 --- /dev/null +++ b/.github/workflows/_link_check.yml @@ -0,0 +1,60 @@ +on: + workflow_call: + inputs: + runner: + type: string + required: true + ref: + type: string + required: true + +jobs: + lint-urls: + if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: + timeout: 120 + runner: ${{ inputs.runner }}linux.2xlarge + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + submodules: false + ref: ${{ inputs.ref }} + script: | + ./scripts/lint_urls.sh $( + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}" + else + echo "${{ github.event.before }}" "${{ github.sha }}" + fi + ) || { + echo + echo "URL lint failed." + echo "If this is a transient outage, you can bypass it by adding the \`skip-url-lint\` label to your PR." + echo "Or add \`@lint-ignore\` somewhere on the same line as the URL you want to skip checking." 
+ exit 1 + } + + lint-xrefs: + if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 + with: + timeout: 60 + runner: ${{ inputs.runner }}linux.2xlarge + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + submodules: false + ref: ${{ inputs.ref }} + script: | + ./scripts/lint_xrefs.sh $( + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}" + else + echo "${{ github.event.before }}" "${{ github.sha }}" + fi + ) || { + echo + echo "Xref lint failed." + echo "If this is a transient outage, you can bypass it by adding the \`skip-xref-lint\` label to your PR." + echo "Or add \`@lint-ignore\` somewhere on the same line as the reference you want to skip checking." + exit 1 + } diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 7426b62428a9..0df563f49682 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -74,6 +74,35 @@ on: Overwrite the number of jobs to use for the build required: false type: string +<<<<<<< HEAD +======= + disable-monitor: + description: | + Disable utilization monitoring for build job + required: false + type: boolean + default: false + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. + required: false + type: number + default: 1 + + allow-reuse-old-whl: + description: | + If set, the build try to pull an old wheel from s3 that was built on a + commit with no cpp changes from this commit + required: false + type: boolean + default: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: HUGGING_FACE_HUB_TOKEN: @@ -106,7 +135,11 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +149,11 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. 
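Stepping back to the new `_link_check.yml` workflow added above: both jobs drive `scripts/lint_urls.sh` / `scripts/lint_xrefs.sh` with a commit range that depends on the trigger, and turn a non-zero exit into a hint about the bypass label. A standalone sketch of that invocation pattern is below; only the script path, label name, and `@lint-ignore` convention come from the hunk, and the event name and SHAs are placeholders.

```bash
#!/usr/bin/env bash
# Sketch of the lint-urls invocation pattern; event name and SHAs are placeholders.
EVENT_NAME="pull_request"                     # stands in for github.event_name
BASE_SHA="1111111"; HEAD_SHA="2222222"        # PR base/head SHAs
BEFORE_SHA="3333333"; CURRENT_SHA="4444444"   # push before/after SHAs

if [ "${EVENT_NAME}" = "pull_request" ]; then
  range=("${BASE_SHA}" "${HEAD_SHA}")
else
  range=("${BEFORE_SHA}" "${CURRENT_SHA}")
fi

./scripts/lint_urls.sh "${range[@]}" || {
  echo
  echo "URL lint failed."
  echo "If this is a transient outage, add the skip-url-lint label to the PR,"
  echo "or put @lint-ignore on the same line as the URL to skip checking it."
  exit 1
}
```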
- name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -125,13 +162,18 @@ jobs: if: inputs.build-environment != 'linux-s390x-binary-manywheel' - name: configure aws credentials +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-build aws-region: us-east-1 +<<<<<<< HEAD - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 @@ -159,6 +201,8 @@ jobs: id: parse-ref run: .github/scripts/parse_ref.py +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -166,6 +210,47 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD +======= + - name: Check if can use old whl build + id: use-old-whl + uses: ./.github/actions/reuse-old-whl + if: ${{ inputs.allow-reuse-old-whl }} + with: + build-environment: ${{ inputs.build-environment }} + run-id: ${{ github.run_id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + job-id: ${{ steps.get-job-id.outputs.job-id }} + job-name: ${{ steps.get-job-id.outputs.job-name }} + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + if: inputs.build-environment != 'linux-s390x-binary-manywheel' + with: + docker-image-name: ${{ inputs.docker-image-name }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Apply the filter logic to the build step too if the test-config label is already there - name: Select all requested test configurations (if the test matrix is available) id: filter @@ -176,17 +261,49 @@ jobs: selected-test-configs: ${{ inputs.selected-test-configs }} job-name: ${{ steps.get-job-id.outputs.job-name }} +<<<<<<< HEAD - name: Download pytest cache uses: ./.github/actions/pytest-cache-download continue-on-error: true if: inputs.build-environment != 'linux-s390x-binary-manywheel' +======= + - name: 
Start monitoring script + id: monitor-script + if: ${{ !inputs.disable-monitor }} + shell: bash + continue-on-error: true + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} + run: | + mkdir -p ../../usage_logs + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m tools.stats.monitor \ + --log-interval "$MONITOR_LOG_INTERVAL" \ + --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \ + > "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 & + echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" + + - name: Download pytest cache + uses: ./.github/actions/pytest-cache-download + continue-on-error: true + if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: cache_dir: .pytest_cache job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} s3_bucket: ${{ inputs.s3-bucket }} - name: Build +<<<<<<< HEAD if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' +======= + if: (steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '') && steps.use-old-whl.outputs.reuse != 'true' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: build env: BUILD_ENVIRONMENT: ${{ inputs.build-environment }} @@ -280,14 +397,33 @@ jobs: END_TIME=$(date +%s) echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" +<<<<<<< HEAD - name: Archive artifacts into zip if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' +======= + - name: Stop monitoring script + if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }} + shell: bash + continue-on-error: true + env: + MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }} + run: | + kill "$MONITOR_SCRIPT_PID" + + - name: Archive artifacts into zip + if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && steps.use-old-whl.outputs.reuse != 'true' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files - name: Store PyTorch Build Artifacts on S3 +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 + if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment != 'linux-s390x-binary-manywheel' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build-environment }} retention-days: 14 @@ -296,14 +432,41 @@ jobs: s3-bucket: ${{ inputs.s3-bucket }} - name: Store PyTorch Build Artifacts for s390x +<<<<<<< HEAD uses: actions/upload-artifact@v4 if: inputs.build-generates-artifacts && 
steps.build.outcome != 'skipped' && inputs.build-environment == 'linux-s390x-binary-manywheel' +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment == 'linux-s390x-binary-manywheel' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build-environment }} retention-days: 14 if-no-files-found: error path: artifacts.zip +<<<<<<< HEAD +======= + - name: copy logs + shell: bash + if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}} + continue-on-error: true + run: | + rm -f ./usage_logs + mkdir -p ./usage_logs + cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/ + + - name: Upload raw usage log to s3 + if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}} + uses: seemethere/upload-artifact-s3@v5 + with: + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact + retention-days: 14 + if-no-files-found: warn + path: usage_logs/usage_log_build_*.txt + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Upload sccache stats if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel' uses: ./.github/actions/upload-sccache-stats @@ -311,8 +474,25 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} build-time: ${{ steps.build.outputs.build_time }} +<<<<<<< HEAD - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + - name: Upload utilization stats + if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }} + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' - name: Cleanup docker diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 389a65a782c8..44d09f39d94d 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -55,6 +55,21 @@ on: required: false type: boolean default: false +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. 
+ required: false + type: number + default: 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: HUGGING_FACE_HUB_TOKEN: required: false @@ -80,7 +95,11 @@ jobs: timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -89,7 +108,11 @@ jobs: docker exec -it $(docker container ps --format '{{.ID}}') bash - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -99,7 +122,11 @@ jobs: - name: configure aws credentials if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: ${{ inputs.aws-role-to-assume }} role-session-name: gha-linux-test @@ -107,7 +134,11 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image }} @@ -119,11 +150,19 @@ jobs: ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash run: | +<<<<<<< HEAD tag=${ECR_DOCKER_IMAGE##*/} echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -135,7 +174,11 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ contains(inputs.build-environment, 'cuda') && 
!contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} - name: Setup GPU_FLAG for docker run @@ -172,9 +215,17 @@ jobs: JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} WORKFLOW_NAME: ${{ github.workflow }} WORKFLOW_RUN_ID: ${{github.run_id}} +<<<<<<< HEAD run: | python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 dataclasses_json==0.6.7 python3 -m tools.stats.monitor > usage_log.txt 2>&1 & +======= + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} + run: | + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Download build artifacts @@ -193,7 +244,11 @@ jobs: run: .github/scripts/parse_ref.py - name: Check for keep-going label and re-enabled test issues +<<<<<<< HEAD # This uses the filter-test-configs action because it conviniently +======= + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -362,7 +417,11 @@ jobs: - name: Upload pytest cache if tests failed uses: ./.github/actions/pytest-cache-upload continue-on-error: true +<<<<<<< HEAD if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' +======= + if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' && inputs.build-environment != 'linux-s390x-binary-manywheel' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: cache_dir: .pytest_cache shard: ${{ matrix.shard }} @@ -371,7 +430,12 @@ jobs: job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 + if: inputs.build-environment != 'linux-s390x-binary-manywheel' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false @@ -408,7 +472,11 @@ jobs: find . 
-iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; - name: Store Core dumps on S3 +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: failure() with: name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} @@ -417,7 +485,11 @@ jobs: path: ./**/core.[1-9]* - name: Upload utilization stats +<<<<<<< HEAD if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} +======= + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true uses: ./.github/actions/upload-utilization-stats with: @@ -428,7 +500,11 @@ jobs: workflow_attempt: ${{github.run_attempt}} - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' # NB: We are currently having an intermittent GPU-related issue on G5 runners with @@ -445,8 +521,11 @@ jobs: - name: Check NVIDIA driver installation step if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped' shell: bash +<<<<<<< HEAD env: RUNNER_WORKSPACE: ${{ runner.workspace }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | set +e set -x diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 0c0d42d398a6..f42e768b62d7 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -30,6 +30,7 @@ on: python-version: required: false type: string +<<<<<<< HEAD default: "3.9" description: | The python version to be used. Will be 3.9 by default @@ -37,6 +38,11 @@ on: required: false type: string description: Set the conda environment file used to setup macOS build. +======= + default: "3.12" + description: | + The python version to be used. 
Will be 3.9 by default +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: required: false type: string @@ -71,11 +77,19 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Clean up disk space before running MacOS workflow +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Set xcode version env: @@ -85,6 +99,7 @@ jobs: echo "DEVELOPER_DIR=/Applications/Xcode_${XCODE_VERSION}.app/Contents/Developer" >> "${GITHUB_ENV}" fi +<<<<<<< HEAD - name: Setup miniconda if: inputs.environment-file == '' uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 @@ -105,6 +120,16 @@ jobs: - name: Install sccache (only for non-forked PRs, and pushes to trunk) uses: nick-fields/retry@v3.0.0 +======= + - name: Setup Python + uses: pytorch/test-infra/.github/actions/setup-python@release/2.8 + with: + python-version: ${{ inputs.python-version }} + pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} with: timeout_minutes: 5 @@ -186,7 +211,11 @@ jobs: zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .additional_ci_files - name: Store PyTorch Build Artifacts on GHA +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -195,7 +224,11 @@ jobs: path: artifacts.zip - name: Upload sccache stats to GHA +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Only if sccache is installed, see above if: ${{ (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && steps.build.outcome != 'skipped' }} with: @@ -207,4 +240,8 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 013461825f9a..8a2632b7d1ee 100644 --- 
a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -21,7 +21,11 @@ on: python-version: required: false type: string +<<<<<<< HEAD default: "3.9" +======= + default: "3.12" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) description: | The python version to be used. Will be 3.9 by default timeout-minutes: @@ -38,13 +42,36 @@ on: required: false type: boolean default: true +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. + required: false + type: number + default: 1 + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: test: # Don't run on forked repos or empty test matrix if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' +<<<<<<< HEAD # For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 # Also ensure that we always run with the right architecture +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) defaults: run: shell: bash -e -l {0} @@ -73,6 +100,13 @@ jobs: pkill "${PROCESS}" || true done +<<<<<<< HEAD +======= + - name: Clean up leftover miniconda installation + continue-on-error: true + run: brew uninstall miniconda || true + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Clean up leftover local python3 site-packages on MacOS pet runner continue-on-error: true run: | @@ -82,18 +116,46 @@ jobs: done - name: Clean up disk space before running MacOS workflow +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Start monitoring script id: monitor-script if: ${{ !inputs.disable-monitor }} continue-on-error: true +<<<<<<< HEAD run: | ${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 & +======= + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} + run: | + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > 
usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Download build artifacts @@ -108,17 +170,27 @@ jobs: with: use-gha: true +<<<<<<< HEAD - name: Setup miniconda uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 with: python-version: ${{ inputs.python-version }} environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt +======= + - name: Setup Python + uses: pytorch/test-infra/.github/actions/setup-python@release/2.8 + with: + python-version: ${{ inputs.python-version }} + pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + default-packages: "" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py +<<<<<<< HEAD - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -128,6 +200,10 @@ jobs: - name: Check for keep-going label and re-enabled test issues # This uses the filter-test-configs action because it conviniently +======= + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -166,10 +242,15 @@ jobs: JOB_ID: ${{ steps.get-job-id.outputs.job-id }} JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} +<<<<<<< HEAD +======= + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run: | # shellcheck disable=SC1090 set -ex +<<<<<<< HEAD arch if [[ -n "$CONDA_ENV" ]]; then @@ -186,21 +267,46 @@ jobs: ${CONDA_RUN} python --version ${CONDA_RUN} python3 -mpip install --no-index --no-deps dist/*.whl +======= + # TODO: Remove me later, and properly activate venv + PATH="$(dirname "$(which python)"):$PATH" + export PATH + + # Print out some information about the test environment + for tool in python3 python; do + which $tool + $tool --version + done + + python3 -mpip install --no-index --no-deps dist/*.whl +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set +e pushd "${RUNNER_TEMP}" # Install pip dependencies if they are not found. This is to mitigate a peculiar # flaky missing dependencies on MacOS +<<<<<<< HEAD ${CONDA_RUN} python3 -c "import torch" +======= + python3 -c "import torch" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RC=$? 
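The RC recorded above drives the conditional reinstall that follows; as a rough Python sketch of the same probe-then-reinstall pattern (the requirements path is a stand-in for whatever PIP_REQUIREMENTS_FILE resolves to in this step, so treat the snippet as illustrative rather than part of the workflow):

    import subprocess
    import sys

    # Probe the freshly installed wheel; a non-zero return code means "import torch" failed.
    rc = subprocess.call([sys.executable, "-c", "import torch"])
    if rc != 0:
        # Mirror the shell fallback: reinstall the pinned pip requirements so the
        # subsequent test script can import the wheel.
        requirements_file = ".github/requirements/pip-requirements-macOS.txt"  # assumed value of PIP_REQUIREMENTS_FILE
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--ignore-installed", "-r", requirements_file]
        )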
popd if [ "${RC}" -ne 0 ]; then +<<<<<<< HEAD ${CONDA_RUN} python3 -mpip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}" fi set -e ${CONDA_RUN} .ci/pytorch/macos-test.sh +======= + python3 -mpip install --ignore-installed -r "${PIP_REQUIREMENTS_FILE}" + fi + set -e + + .ci/pytorch/macos-test.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Print remaining test logs shell: bash @@ -208,6 +314,16 @@ jobs: run: | cat test/**/*_toprint.log || true +<<<<<<< HEAD +======= + - name: Run OP benchmark + shell: bash + if: ${{ contains(steps.get-job-id.outputs.job-name, 'mps') }} + run: | + python3 test/bench_mps_ops.py + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Stop monitoring script if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }} continue-on-error: true @@ -224,14 +340,37 @@ jobs: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false schema-version: v3 github-token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD - name: Clean up disk space if: always() continue-on-error: true uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7 +======= + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + local_path: usage_log.txt + + - name: Clean up disk space + if: always() + continue-on-error: true + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index babcc4c9bac9..1c01ef54fe8f 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -50,7 +50,22 @@ on: required: false type: boolean default: true +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. 
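The required/type/default fields for this input continue below; both intervals are ultimately passed to tools.stats.monitor as --log-interval and --data-collect-interval in the monitoring step further down. A minimal sketch of how a sampler could consume them, assuming the flags simply control how often data is collected (default 1s) and how often an aggregated line is logged (default 5s), with psutil standing in for the real collection logic:

    import argparse
    import time

    import psutil  # pinned as psutil==5.9.1 in the monitoring step

    parser = argparse.ArgumentParser()
    parser.add_argument("--log-interval", type=float, default=5)
    parser.add_argument("--data-collect-interval", type=float, default=1)
    args = parser.parse_args()

    samples, last_log = [], time.monotonic()
    while True:
        # Collect one utilization sample per data-collect interval.
        samples.append(psutil.cpu_percent(interval=None))
        if time.monotonic() - last_log >= args.log_interval:
            # Emit one aggregated line per log interval.
            print(f"avg cpu: {sum(samples) / len(samples):.1f}%", flush=True)
            samples, last_log = [], time.monotonic()
        time.sleep(args.data_collect_interval)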
+ required: false + type: number + default: 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -70,7 +85,11 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -79,7 +98,11 @@ jobs: - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -88,27 +111,63 @@ jobs: - name: Login to Amazon ECR id: login-ecr continue-on-error: true +<<<<<<< HEAD uses: aws-actions/amazon-ecr-login@v2 - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 +======= + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Start monitoring script id: monitor-script +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Start monitoring script + id: monitor-script + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ !inputs.disable-monitor }} shell: bash continue-on-error: true run: | +<<<<<<< HEAD python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 python3 -m tools.stats.monitor > usage_log.txt 2>&1 & +======= + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Download build artifacts @@ -124,6 +183,7 @@ jobs: id: 
parse-ref run: .github/scripts/parse_ref.py +<<<<<<< HEAD - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -133,6 +193,10 @@ jobs: - name: Check for keep-going label and re-enabled test issues # This uses the filter-test-configs action because it conviniently +======= + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -285,7 +349,11 @@ jobs: find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; - name: Store Core dumps on GitHub +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: failure() with: name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} @@ -294,7 +362,11 @@ jobs: path: ./**/core.[1-9]* - name: Authenticate with AWS +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results # The max duration enforced by the server side @@ -302,12 +374,30 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: benchmark-results-dir: test/test-reports dry-run: false schema-version: v3 github-token: ${{ secrets.GITHUB_TOKEN }} +<<<<<<< HEAD +======= + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm diff --git a/.github/workflows/_runner-determinator.yml b/.github/workflows/_runner-determinator.yml index b608a71c055a..ded600fa9230 100644 --- a/.github/workflows/_runner-determinator.yml +++ b/.github/workflows/_runner-determinator.yml @@ -7,7 +7,15 @@ on: required: false type: string description: | +<<<<<<< HEAD List of experiments for this workfow. If not defined, all default experiments are included. +======= + List of experiments for this workflow. If not defined, all default experiments are included. 
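The opt_out_experiments input introduced just below feeds the script's --opt-out-experiments flag (a comma-separated set), and per its help text it takes priority over --eligible-experiments. A compact sketch of that precedence, using the same frozenset semantics as the script further down (the sample experiment name is made up for illustration):

    def experiment_enabled(name: str, eligible: frozenset[str], opted_out: frozenset[str]) -> bool:
        # Opt-out wins: an experiment listed in both sets is still skipped.
        if opted_out and name in opted_out:
            return False
        # An empty eligible set means all default experiments remain eligible.
        if eligible and name not in eligible:
            return False
        return True

    # Example (hypothetical experiment name):
    # experiment_enabled("some_experiment", frozenset({"some_experiment"}), frozenset({"some_experiment"})) -> False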
+ opt_out_experiments: + required: false + type: string + description: Comma-separated list of experiments this workflow will opt-out of. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) triggering_actor: required: true type: string @@ -51,10 +59,18 @@ jobs: TRIGGERING_ACTOR: ${{ inputs.triggering_actor }} ISSUE_OWNER: ${{ inputs.issue_owner }} CHECK_EXPERIMENTS: ${{ inputs.check_experiments }} +<<<<<<< HEAD PR_NUMBER: ${{ github.event.pull_request.number }} steps: # - name: Checkout PyTorch # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + OPT_OUT_EXPERIMENTS: ${{ inputs.opt_out_experiments }} + PR_NUMBER: ${{ github.event.pull_request.number }} + steps: + # - name: Checkout PyTorch + # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # with: # fetch-depth: 1 # submodules: true @@ -267,6 +283,19 @@ jobs: help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked", ) parser.add_argument( +<<<<<<< HEAD +======= + "--opt-out-experiments", + type=_str_comma_separated_to_set, + required=False, + default="", + help=( + "comma separated list of experiments to opt-out of. If unset, no opt-outs will occur. " + "If the same experiment is listed both here and in '--eligible-experiments' opt-out will take priority." + ), + ) + parser.add_argument( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "--pr-number", type=str, required=False, @@ -490,6 +519,10 @@ jobs: workflow_requestors: Iterable[str], branch: str, eligible_experiments: frozenset[str] = frozenset(), +<<<<<<< HEAD +======= + opt_out_experiments: frozenset[str] = frozenset(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_canary: bool = False, ) -> str: settings = parse_settings(rollout_state) @@ -504,6 +537,17 @@ jobs: ) continue +<<<<<<< HEAD +======= + if opt_out_experiments: + if experiment_name in opt_out_experiments: + opt_out_exp_list = ", ".join(opt_out_experiments) + log.info( + f"Skipping experiment '{experiment_name}', as this workflow has opted-out (opted out experiments are: {opt_out_exp_list})" + ) + continue + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if eligible_experiments: if experiment_name not in eligible_experiments: exp_list = ", ".join(eligible_experiments) @@ -668,6 +712,10 @@ jobs: (args.github_issue_owner, username), args.github_branch, args.eligible_experiments, +<<<<<<< HEAD +======= + args.opt_out_experiments, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) is_canary, ) @@ -705,4 +753,8 @@ jobs: --github-ref-type "$curr_ref_type" \ --github-repo "$GITHUB_REPOSITORY" \ --eligible-experiments "$CHECK_EXPERIMENTS" \ +<<<<<<< HEAD +======= + --opt-out-experiments "$OPT_OUT_EXPERIMENTS" \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --pr-number "${PR_NUMBER}" diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 27f75767b685..6a59a1f86286 100644 --- 
a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -23,7 +23,11 @@ on: vc-year: required: false type: string +<<<<<<< HEAD default: "2019" +======= + default: "2022" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) description: The Visual Studio year to use for building. build-with-debug: required: false @@ -84,10 +88,17 @@ jobs: git config --global core.fsmonitor false - name: Clean up leftover processes on non-ephemeral Windows runner +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.7 - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.8 + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -98,11 +109,19 @@ jobs: To start build locally, change working folder to \actions-runner\_work\pytorch\pytorch, Activate miniconda and Visual Studio environment, by running: call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3 +<<<<<<< HEAD call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 # [see note: pytorch repo ref] - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -173,7 +192,11 @@ jobs: # Upload to github so that people can click and download artifacts - name: Upload artifacts to s3 if: steps.build.outcome != 'skipped' +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: retention-days: 14 if-no-files-found: error diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 544e6389c46c..60c989ce58ac 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -36,7 +36,22 @@ on: required: false type: boolean default: true +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. 
+ required: false + type: number + default: 1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -66,10 +81,17 @@ jobs: git config --global core.fsmonitor false - name: Clean up leftover processes on non-ephemeral Windows runner +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.7 - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.8 + + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -80,12 +102,20 @@ jobs: To start tests locally, change working folder to \actions-runner\_work\pytorch\pytorch\test, Activate miniconda and Visual Studio environment and set PYTHON_PATH, by running: call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3 +<<<<<<< HEAD call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 +======= + call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set PYTHONPATH=C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: no-sudo: true @@ -96,7 +126,11 @@ jobs: # TODO: Move to a requirements.txt file for windows - name: Install pip dependencies +<<<<<<< HEAD uses: nick-fields/retry@v3.0.0 +======= + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: shell: bash timeout_minutes: 5 @@ -106,18 +140,46 @@ jobs: set -eu python3 -m pip install 'xdoctest>=1.1.0' +<<<<<<< HEAD - name: Start monitoring script id: monitor-script +======= + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Start monitoring script + id: monitor-script + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) shell: bash if: ${{ !inputs.disable-monitor }} continue-on-error: true run: | # Windows conda doesn't have python3 binary, only python, but it's python3 +<<<<<<< HEAD ${CONDA_RUN} python -m tools.stats.monitor > usage_log.txt 2>&1 & echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: 
Download PyTorch Build Artifacts uses: seemethere/download-artifact-s3@v4 +======= + ${CONDA_RUN} python -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 + ${CONDA_RUN} python -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & + echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" + + - name: Download PyTorch Build Artifacts + uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ inputs.build-environment }} path: C:\${{ github.run_id }}\build-results @@ -131,6 +193,7 @@ jobs: continue-on-error: true uses: ./.github/actions/download-td-artifacts +<<<<<<< HEAD - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -140,6 +203,10 @@ jobs: - name: Check for keep-going label and re-enabled test issues # This uses the filter-test-configs action because it conviniently +======= + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -172,8 +239,13 @@ jobs: NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} VC_PRODUCT: "BuildTools" VC_VERSION: "" +<<<<<<< HEAD VS_VERSION: "16.8.6" VC_YEAR: "2019" +======= + VS_VERSION: "17.4.1" + VC_YEAR: "2022" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AWS_DEFAULT_REGION: us-east-1 PR_NUMBER: ${{ github.event.pull_request.number }} GITHUB_REPOSITORY: ${{ github.repository }} @@ -236,6 +308,20 @@ jobs: with: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} +<<<<<<< HEAD +======= + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Parse ref id: parse-ref shell: bash diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index baee45d2e9b1..2d1185e51433 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -46,6 +46,24 @@ on: required: false type: boolean default: true +<<<<<<< HEAD +======= + monitor-log-interval: + description: | + Set the interval for the monitor script to log utilization. + required: false + type: number + default: 5 + monitor-data-collect-interval: + description: | + Set the interval for the monitor script to collect data. 
+ required: false + type: number + default: 1 +permissions: + id-token: write + contents: read +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} @@ -62,20 +80,29 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup XPU uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 - name: Login to Amazon ECR id: login-ecr +<<<<<<< HEAD uses: aws-actions/amazon-ecr-login@v2 - name: Calculate docker image @@ -89,14 +116,58 @@ jobs: with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} +======= + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-image-name: ${{ inputs.docker-image }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Start monitoring script id: monitor-script if: ${{ !inputs.disable-monitor }} shell: bash continue-on-error: true +<<<<<<< HEAD run: | python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 python3 -m tools.stats.monitor > usage_log.txt 2>&1 & +======= + env: + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + WORKFLOW_NAME: ${{ github.workflow }} + WORKFLOW_RUN_ID: ${{github.run_id}} + MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }} + MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }} + run: | + python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7 nvidia-ml-py==11.525.84 + python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 & +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" - name: Download build artifacts @@ -108,6 +179,7 @@ jobs: id: parse-ref run: 
.github/scripts/parse_ref.py +<<<<<<< HEAD - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -117,6 +189,10 @@ jobs: - name: Check for keep-going label and re-enabled test issues # This uses the filter-test-configs action because it conviniently +======= + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conveniently +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # checks for labels and re-enabled test issues. It does not actually do # any filtering. All filtering is done in the build step. id: keep-going @@ -244,6 +320,14 @@ jobs: # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" +<<<<<<< HEAD +======= + - name: Change permissions + if: ${{ always() && steps.test.conclusion }} + run: | + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Print remaining test logs shell: bash if: always() && steps.test.conclusion @@ -266,6 +350,20 @@ jobs: use-gha: true file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} +<<<<<<< HEAD +======= + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Collect backtraces from coredumps (if any) if: always() run: | @@ -279,7 +377,11 @@ jobs: docker stop "${{ env.CONTAINER_NAME }}" - name: Store Core dumps on GitHub +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: failure() with: name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} diff --git a/.github/workflows/assigntome-docathon.yml b/.github/workflows/assigntome-docathon.yml index 31fa28289b04..5bac7d3872c5 100644 --- a/.github/workflows/assigntome-docathon.yml +++ b/.github/workflows/assigntome-docathon.yml @@ -12,7 +12,11 @@ jobs: issues: write steps: - name: Check for "/assigntome" in comment +<<<<<<< HEAD uses: actions/github-script@v6 +======= + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: @@ -28,14 +32,22 @@ jobs: repo: context.repo.repo, issue_number: issueNumber }); +<<<<<<< HEAD const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2024'); +======= + const hasLabel = 
issue.labels.some(label => label.name === 'docathon-h1-2025'); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (hasLabel) { if (issue.assignee !== null) { await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: issueNumber, +<<<<<<< HEAD body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)." +======= + body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2025 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2025)." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } else { await github.rest.issues.addAssignees({ @@ -46,7 +58,11 @@ jobs: }); } } else { +<<<<<<< HEAD const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)." +======= + const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2025 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2025)." +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml index 9aaf21193512..75138e8d3e78 100644 --- a/.github/workflows/auto_request_review.yml +++ b/.github/workflows/auto_request_review.yml @@ -15,7 +15,11 @@ jobs: steps: - name: Request review based on files changes and/or groups the author belongs to # v0.7.0 +<<<<<<< HEAD uses: necojackarc/auto-request-review@e08cdffa277d50854744de3f76230260e61c67f4 +======= + uses: necojackarc/auto-request-review@e08cdffa277d50854744de3f76230260e61c67f4 # v0.7.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 68aa873037f0..ccecd7e64ab9 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -11,6 +11,7 @@ on: # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ paths: +<<<<<<< HEAD - '.ci/docker/almalinux/*' - '.ci/docker/common/*' - .github/workflows/build-almalinux-images.yml @@ -19,11 +20,25 @@ on: - '.ci/docker/almalinux/*' - '.ci/docker/common/*' - .github/workflows/build-almalinux-images.yml +======= + - .ci/docker/** + - .github/workflows/build-almalinux-images.yml + - .github/actions/binary-docker-build/** + pull_request: + paths: + - .ci/docker/** + - .github/workflows/build-almalinux-images.yml + - .github/actions/binary-docker-build/** +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: DOCKER_REGISTRY: "docker.io" DOCKER_BUILDKIT: 1 +<<<<<<< HEAD WITH_PUSH: ${{ github.event_name == 'push' && 
(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} +======= + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -32,6 +47,7 @@ concurrency: jobs: build-docker: if: github.repository_owner == 'pytorch' +<<<<<<< HEAD environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} runs-on: linux.9xlarge.ephemeral strategy: @@ -71,3 +87,19 @@ jobs: retry_wait_seconds: 90 command: | .ci/docker/almalinux/build.sh almalinux-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}} +======= + environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }} + runs-on: linux.9xlarge.ephemeral + strategy: + matrix: + tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"] + steps: + - name: Build docker image + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 + with: + docker-image-name: almalinux-builder + custom-tag-prefix: ${{matrix.tag}} + docker-build-dir: almalinux + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + DOCKER_ID: ${{ secrets.DOCKER_ID }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index 3372888cf848..1d52e860f9ba 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -10,6 +10,7 @@ on: # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ paths: +<<<<<<< HEAD - '.ci/docker/libtorch/*' - '.ci/docker/common/*' - .github/workflows/build-libtorch-images.yml @@ -18,11 +19,25 @@ on: - '.ci/docker/libtorch/*' - '.ci/docker/common/*' - .github/workflows/build-libtorch-images.yml +======= + - .ci/docker/** + - .github/workflows/build-libtorch-images.yml + - .github/actions/binary-docker-build/** + pull_request: + paths: + - .ci/docker/** + - .github/workflows/build-libtorch-images.yml + - .github/actions/binary-docker-build/** +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: DOCKER_REGISTRY: "docker.io" DOCKER_BUILDKIT: 1 +<<<<<<< HEAD WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} +======= + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -32,13 +47,18 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD build-docker-cuda: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type @@ -159,3 +179,30 @@ jobs: retry_wait_seconds: 90 command: | .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu +======= + build: + environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }} + needs: get-label-type + runs-on: ${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral + name: libtorch-cxx11-builder:${{ matrix.tag }} + strategy: + fail-fast: false + matrix: + include: [ + { tag: "cuda12.9" }, + { tag: "cuda12.8" }, + { tag: "cuda12.6" }, + { tag: "rocm6.3" }, + { tag: "rocm6.4" }, + { tag: "cpu" }, + ] + steps: + - name: Build docker image + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 + with: + docker-image-name: libtorch-cxx11-builder + custom-tag-prefix: ${{ matrix.tag }} + docker-build-dir: libtorch + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + DOCKER_ID: ${{ secrets.DOCKER_ID }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/build-magma-linux.yml b/.github/workflows/build-magma-linux.yml index aeaf6e6717a8..cd57d925bc02 100644 --- a/.github/workflows/build-magma-linux.yml +++ b/.github/workflows/build-magma-linux.yml @@ -34,23 +34,38 @@ jobs: id-token: write strategy: matrix: +<<<<<<< HEAD cuda_version: ["128", "126", "124", "118"] steps: - name: Checkout PyTorch uses: actions/checkout@v4 +======= + cuda_version: ["129", "128", "126"] + steps: + - name: Checkout PyTorch + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Build Magma Cuda working-directory: .ci/magma run: | # Produces artifacts under magma/output/linux-64/magma-cuda*.bz2 make magma-cuda${{ matrix.cuda_version }} - name: Save as artifact +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: path: .ci/magma/output/linux-64/magma-cuda*.bz2 name: artifact_${{ matrix.cuda_version }} - name: Configure AWS credentials(PyTorch account) if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: 
arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
          aws-region: us-east-1
diff --git a/.github/workflows/build-magma-rocm-linux.yml b/.github/workflows/build-magma-rocm-linux.yml
new file mode 100644
index 000000000000..b6eb09188fd4
--- /dev/null
+++ b/.github/workflows/build-magma-rocm-linux.yml
@@ -0,0 +1,69 @@
+name: build-linux-magma-rocm
+
+on:
+  push:
+    branches:
+      main
+    paths:
+      - .ci/magma-rocm/*
+      - .ci/magma-rocm/package_files/*
+      - .github/workflows/build-magma-rocm-linux.yml
+  pull_request:
+    paths:
+      - .ci/magma-rocm/*
+      - .ci/magma-rocm/package_files/*
+      - .github/workflows/build-magma-rocm-linux.yml
+
+defaults:
+  run:
+    shell: bash -x -e -l {0}
+env:
+  BUILD_ENVIRONMENT: build-linux-magma-rocm
+  IN_CI: 1
+  IS_GHA: 1
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  build-linux-magma-rocm:
+    if: github.repository_owner == 'pytorch'
+    runs-on: linux.2xlarge
+    permissions:
+      id-token: write
+    strategy:
+      matrix:
+        rocm_version: ["64", "63"]
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Build Magma Rocm
+        working-directory: .ci/magma-rocm
+        run: |
+          # Produces artifacts under magma-rocm/output/linux-64/magma-rocm*.bz2
+          make magma-rocm${{ matrix.rocm_version }}
+      - name: Save as artifact
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          path: .ci/magma-rocm/output/linux-64/magma-rocm*.bz2
+          name: artifact_${{ matrix.rocm_version }}
+      - name: Configure AWS credentials(PyTorch account)
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
+          aws-region: us-east-1
+      - name: Set DRY_RUN
+        if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
+        run: |
+          echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
+      - name: Upload binaries
+        shell: bash
+        env:
+          PKG_DIR: ".ci/magma-rocm/output/linux-64/"
+          TARGET_OS: "linux"
+          PKG_INCLUDE: "magma-rocm*.tar.bz2"
+        run: |
+          set -ex
+          bash .github/scripts/upload_aws_ossci.sh
diff --git a/.github/workflows/build-magma-windows.yml b/.github/workflows/build-magma-windows.yml
index 9a1970a5feb7..168215334e3e 100644
--- a/.github/workflows/build-magma-windows.yml
+++ b/.github/workflows/build-magma-windows.yml
@@ -19,17 +19,31 @@ concurrency:
 jobs:
   build-windows-magma:
     if: github.repository_owner == 'pytorch'
+<<<<<<< HEAD
     runs-on: windows-2019
     strategy:
       matrix:
        cuda_version: ["128", "126", "124", "118"]
+=======
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        cuda_version: ["129", "128", "126"]
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
        config: ["Release", "Debug"]
     env:
       CUDA_VERSION: ${{ matrix.cuda_version }}
       CONFIG: ${{ matrix.config }}
+<<<<<<< HEAD
     steps:
       - name: Checkout pytorch/pytorch
        uses: actions/checkout@v4
+=======
+    VC_YEAR: "2022"
+    steps:
+      - name: Checkout pytorch/pytorch
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
      - name: Enable MSVC dev commands to enable cl.exe # FYI incompatible with shell: bash
        uses: ilammy/msvc-dev-cmd@dd5e2fa0a7de1e7929605d9ecc020e749d9856a3
      - name: Install CUDA Toolkit
@@ -37,7 +51,11 @@ jobs:
      - name: Build MAGMA and push to S3
        run: .github/scripts/windows/build_magma.bat
      - name: Save as artifact
+<<<<<<< HEAD
        uses: actions/upload-artifact@v4
+=======
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
        with:
          path: magma_*_cuda*_*.7z
          name: artifact_${{ matrix.cuda_version }}_${{ matrix.config }}
@@ -49,12 +67,21 @@ jobs:
     needs: build-windows-magma
     steps:
       - name: Checkout PyTorch
+<<<<<<< HEAD
        uses: actions/checkout@v4
      - name: Download all artifacts
        uses: actions/download-artifact@v4
      - name: Configure AWS credentials(PyTorch account)
        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
        uses: aws-actions/configure-aws-credentials@v3
+=======
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - name: Download all artifacts
+        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
+      - name: Configure AWS credentials(PyTorch account)
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
          aws-region: us-east-1
diff --git a/.github/workflows/build-manywheel-images-s390x.yml b/.github/workflows/build-manywheel-images-s390x.yml
index decedf8a334b..43d4a023a4c2 100644
--- a/.github/workflows/build-manywheel-images-s390x.yml
+++ b/.github/workflows/build-manywheel-images-s390x.yml
@@ -3,6 +3,7 @@ name: Build manywheel docker images for s390x
 on:
   workflow_dispatch:
   push:
+<<<<<<< HEAD
     branches:
       - main
       - release/*
@@ -20,13 +21,22 @@ on:
       - '.ci/docker/manywheel/*'
       - '.ci/docker/manywheel/build_scripts/*'
       - '.ci/docker/common/*'
+=======
+    tags:
+      - ciflow/s390/*
+    paths:
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
       - .github/workflows/build-manywheel-images-s390x.yml
 
 env:
   DOCKER_REGISTRY: "docker.io"
   DOCKER_BUILDKIT: 1
+<<<<<<< HEAD
   WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+=======
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }}
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -35,6 +45,7 @@ concurrency:
 jobs:
   build-docker-cpu-s390x:
     if: github.repository_owner == 'pytorch'
+<<<<<<< HEAD
     environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
     runs-on: linux.s390x
     env:
@@ -57,12 +68,66 @@ jobs:
       - name: Build Docker Image
         run: |
           .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x
+=======
+    environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }}
+    runs-on: linux.s390x
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8
+        with:
+          submodules: false
+          no-sudo: true
+
+      - name: Build Docker Image
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x -t manylinuxs390x-builder:cpu-s390x
+
+      - name: Tag and (if WITH_PUSH) push docker image to docker.io
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+          CREATED_FULL_DOCKER_IMAGE_NAME: manylinuxs390x-builder:cpu-s390x
+        shell: bash
+        run: |
+          set -euox pipefail
+          GITHUB_REF="${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}"
+          GIT_BRANCH_NAME="${GITHUB_REF##*/}"
+          GIT_COMMIT_SHA="${GITHUB_SHA:-$(git rev-parse HEAD)}"
+          CI_FOLDER_SHA="$(git rev-parse HEAD:.ci/docker)"
+
+          DOCKER_IMAGE_NAME_PREFIX="docker.io/pytorch/${CREATED_FULL_DOCKER_IMAGE_NAME}"
+
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
+
+          # Pretty sure Github will mask tokens and I'm not sure if it will even be
+          # printed due to pipe, but just in case
+          set +x
+          if [[ "${WITH_PUSH:-false}" == "true" ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
+          fi
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
       - name: Cleanup docker
         if: cancelled()
         shell: bash
         run: |
+<<<<<<< HEAD
           # if podman build command is interrupted,
           # it can leave a couple of processes still running.
           # order them to stop for clean shutdown.
+=======
+          # If podman build command is interrupted,
+          # it can leave a couple of processes still running.
+          # Order them to stop for clean shutdown.
+          # It looks like sometimes some processes remain
+          # after first cleanup.
+          # Wait a bit and do cleanup again. It looks like it helps.
+ docker system prune --build -f || true + sleep 60 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker system prune --build -f || true diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 1eaf692414e3..d54e1a7f69d2 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -11,6 +11,7 @@ on: # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ paths: +<<<<<<< HEAD - '.ci/docker/common/*' - '.ci/docker/manywheel/*' - '.ci/docker/manywheel/build_scripts/*' @@ -22,12 +23,26 @@ on: - '.ci/docker/manywheel/build_scripts/*' - .github/workflows/build-manywheel-images.yml +======= + - .ci/docker/** + - .github/workflows/build-manywheel-images.yml + - .github/actions/binary-docker-build/** + pull_request: + paths: + - .ci/docker/** + - .github/workflows/build-manywheel-images.yml + - .github/actions/binary-docker-build/** +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: DOCKER_REGISTRY: "docker.io" DOCKER_BUILDKIT: 1 +<<<<<<< HEAD WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }} +======= + WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -36,13 +51,18 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD build-docker-cuda-manylinux_2_28: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type @@ -362,3 +382,35 @@ jobs: retry_wait_seconds: 90 command: | .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu +======= + build: + environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }} + needs: get-label-type + strategy: + fail-fast: false + matrix: + include: [ + { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: 
"linux.arm64.2xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" }, + ] + runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }} + name: ${{ matrix.name }}:${{ matrix.tag }} + steps: + - name: Build docker image + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.8 + with: + docker-image-name: ${{ matrix.name }} + custom-tag-prefix: ${{ matrix.tag }} + docker-build-dir: manywheel + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + DOCKER_ID: ${{ secrets.DOCKER_ID }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 988d18fe736c..0650e34ea652 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -3,7 +3,11 @@ name: Build Triton wheels on: push: branches: +<<<<<<< HEAD - release/2.7 +======= + - release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 @@ -16,6 +20,10 @@ on: - .github/scripts/windows/build_triton.bat - .ci/docker/ci_commit_pins/triton.txt - .ci/docker/ci_commit_pins/triton-xpu.txt +<<<<<<< HEAD +======= + workflow_dispatch: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pull_request: paths: - .github/workflows/build-triton-wheel.yml @@ -34,7 +42,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -53,7 +65,11 @@ jobs: docker-image: ["pytorch/manylinux2_28-builder:cpu"] include: - device: "rocm" +<<<<<<< HEAD rocm_version: "6.3" +======= + rocm_version: "6.4" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" - device: "cuda" rocm_version: "" @@ -72,12 +88,20 @@ jobs: PLATFORM: 'manylinux_2_28_x86_64' steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout 
PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false @@ -85,7 +109,11 @@ jobs: uses: ./.github/actions/setup-linux - name: Pull Docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ env.DOCKER_IMAGE }} @@ -138,6 +166,18 @@ jobs: docker exec -t "${container_name}" yum install -y zlib-devel zip docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel +<<<<<<< HEAD +======= + set +e + docker exec -t "${container_name}" command -v pip + has_pip=$? + set -e + if [ $has_pip -eq 0 ] ; then + docker exec -t "${container_name}" pip install -U cmake --force-reinstall + else + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -m pip install -U cmake --force-reinstall + fi +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "rocm" || "${{ matrix.device }}" == "aarch64" ) ]]; then # With this install, it gets clang 16.0.6. @@ -160,14 +200,22 @@ jobs: fi docker exec -t "${container_name}" chown -R 1000.1000 /artifacts/wheelhouse +<<<<<<< HEAD - uses: actions/upload-artifact@v4.4.0 +======= + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}-${{ env.PLATFORM }} if-no-files-found: error path: ${{ runner.temp }}/artifacts/wheelhouse/* - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() build-wheel-win: @@ -200,7 +248,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -214,7 +266,11 @@ jobs: # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false - name: Checkout PyTorch +<<<<<<< HEAD uses: actions/checkout@v4 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: false @@ -247,7 +303,11 @@ 
jobs: .github/scripts/windows/build_triton.bat mkdir -p "${RUNNER_TEMP}/artifacts/" mv ./*.whl "${RUNNER_TEMP}/artifacts/" +<<<<<<< HEAD - uses: actions/upload-artifact@v4.4.0 +======= + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }} if-no-files-found: error @@ -266,24 +326,40 @@ jobs: image: continuumio/miniconda3:4.12.0 environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} steps: +<<<<<<< HEAD - uses: actions/checkout@v3 - name: Configure AWS credentials(PyTorch account) for main if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} uses: aws-actions/configure-aws-credentials@v3 +======= + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials(PyTorch account) for main + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels aws-region: us-east-1 - name: Configure AWS credentials(PyTorch account) for RC builds if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels aws-region: us-east-1 - name: Download Build Artifacts +<<<<<<< HEAD uses: actions/download-artifact@v4.1.7 +======= + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: # Download all available artifacts path: ${{ runner.temp }}/artifacts-all diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index dc7ee1930920..aa7e64961d85 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -38,13 +38,21 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' architecture: x64 diff --git a/.github/workflows/check_mergeability_ghstack.yml b/.github/workflows/check_mergeability_ghstack.yml index 
ddf5311cbf01..804ae479fe6c 100644 --- a/.github/workflows/check_mergeability_ghstack.yml +++ b/.github/workflows/check_mergeability_ghstack.yml @@ -10,7 +10,11 @@ jobs: if: github.repository_owner == 'pytorch' runs-on: ubuntu-latest steps: +<<<<<<< HEAD - uses: actions/checkout@v4 +======= + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 @@ -50,7 +54,11 @@ jobs: fi - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' cache: pip diff --git a/.github/workflows/cherry-pick.yml b/.github/workflows/cherry-pick.yml index d8eeeb6b4ec8..b8ce2452af99 100644 --- a/.github/workflows/cherry-pick.yml +++ b/.github/workflows/cherry-pick.yml @@ -14,13 +14,21 @@ jobs: steps: - name: Checkout repo id: checkout +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 token: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index b17789f9abe9..047e29cff981 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -13,7 +13,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index c6bf6803c766..89b5ebd89d75 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -19,7 +19,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -36,7 +40,11 @@ jobs: outputs: pt_release_name: ${{ steps.release_name.outputs.pt_release_name }} steps: +<<<<<<< HEAD - uses: actions/checkout@v4 +======= + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) with: show-progress: false submodules: 'recursive' @@ -55,6 +63,11 @@ jobs: tag_or_branch="${tag_or_branch//\//_}" echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" +<<<<<<< HEAD +======= + - name: Checkout optional submodules + run: python3 tools/optional_submodules.py +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Create source distribution run: | # Create new folder with specified name so extracting the archive yields that @@ -69,18 +82,30 @@ jobs: echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" - name: Upload source distribution for release if: ${{ github.event_name == 'release' }} +<<<<<<< HEAD uses: softprops/action-gh-release@v1 +======= + uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: files: ${{env.PT_RELEASE_FILE}} - name: Upload source distribution to GHA artifacts for release tags if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} +<<<<<<< HEAD uses: actions/upload-artifact@v4.4.0 +======= + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: ${{ env.PT_RELEASE_FILE }} path: ${{ env.PT_RELEASE_FILE }} - name: Set output id: release_name +<<<<<<< HEAD run: echo "::set-output name=pt_release_name::${{ env.PT_RELEASE_NAME }}.tar.gz" +======= + run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) upload_source_code_to_s3: if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} @@ -93,6 +118,7 @@ jobs: - get-label-type - release steps: +<<<<<<< HEAD - uses: actions/download-artifact@v4.1.7 with: name: ${{ needs.release.outputs.pt_release_name }} @@ -102,6 +128,17 @@ jobs: role-to-assume: arn:aws:iam::749337293305:role/gha_pytorch_source_code_upload_role aws-region: us-east-1 - uses: seemethere/upload-artifact-s3@v5 +======= + - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: ${{ needs.release.outputs.pt_release_name }} + - name: Configure AWS credentials(PyTorch account) + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_pytorch_source_code_upload_role + aws-region: us-east-1 + - uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: s3-bucket: pytorch s3-prefix: source_code/test diff --git a/.github/workflows/delete_old_branches.yml b/.github/workflows/delete_old_branches.yml index eabb98e32065..803418360826 100644 --- a/.github/workflows/delete_old_branches.yml +++ b/.github/workflows/delete_old_branches.yml @@ -22,12 +22,20 @@ jobs: steps: - name: Checkout repo +<<<<<<< 
HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' architecture: x64 diff --git a/.github/workflows/docathon-sync-label.yml b/.github/workflows/docathon-sync-label.yml index 08703be573a6..65cb16e67d90 100644 --- a/.github/workflows/docathon-sync-label.yml +++ b/.github/workflows/docathon-sync-label.yml @@ -14,11 +14,19 @@ jobs: pull-requests: write steps: - name: Check out the repo +<<<<<<< HEAD uses: actions/checkout@v2 with: fetch-depth: 1 - name: Set up Python uses: actions/setup-python@v2 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 1 + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: 3.x - name: Install dependencies diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 903c81fd539e..b798cdf36703 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -33,7 +33,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -49,6 +53,7 @@ jobs: matrix: runner: [linux.12xlarge] docker-image-name: [ +<<<<<<< HEAD pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks, @@ -74,20 +79,54 @@ jobs: pytorch-linux-focal-py3-clang10-onnx, pytorch-linux-focal-linter, pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter, +======= + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, + pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, + pytorch-linux-jammy-py3.9-clang12, + pytorch-linux-jammy-py3.11-clang12, + pytorch-linux-jammy-py3.12-clang12, + pytorch-linux-jammy-py3.13-clang12, + pytorch-linux-jammy-rocm-n-1-py3, + pytorch-linux-jammy-rocm-n-py3, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12, + pytorch-linux-jammy-py3.9-gcc11, + pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks, + pytorch-linux-jammy-py3.12-halide, + 
pytorch-linux-jammy-xpu-2025.0-py3, + pytorch-linux-jammy-xpu-2025.1-py3, + pytorch-linux-jammy-py3-clang15-asan, + pytorch-linux-jammy-py3-clang18-asan, + pytorch-linux-jammy-py3-clang12-onnx, + pytorch-linux-jammy-linter, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu ] include: - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 +<<<<<<< HEAD runner: linux.arm64.2xlarge +======= + runner: linux.arm64.m7g.4xlarge +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks runner: linux.arm64.m7g.4xlarge timeout-minutes: 600 # Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358 # runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}" runs-on: "${{ matrix.runner }}" +<<<<<<< HEAD env: DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: - name: Clean workspace shell: bash @@ -99,32 +138,54 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Setup Linux uses: ./.github/actions/setup-linux - name: Build docker image id: build-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 with: docker-image-name: ${{ matrix.docker-image-name }} +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-image-name: ci-image:${{ matrix.docker-image-name }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) always-rebuild: true push: true - name: Pull docker image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} - uses: nick-fields/retry@v3.0.0 name: Push to https://https://ghcr.io/ +======= + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.build-docker-image.outputs.docker-image }} + + - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 + name: Push to https://ghcr.io/ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: push-to-ghcr-io if: ${{ github.event_name == 'push' }} env: ECR_DOCKER_IMAGE: ${{ steps.build-docker-image.outputs.docker-image }} GHCR_PAT: ${{ secrets.GHCR_PAT }} +<<<<<<< HEAD IMAGE_NAME: ${{ matrix.docker-image-name }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: shell: bash timeout_minutes: 30 @@ -135,8 +196,13 @@ jobs: tag=${ECR_DOCKER_IMAGE##*:} # Push docker image to the ghcr.io echo 
$GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin +<<<<<<< HEAD docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${IMAGE_NAME}-${tag}" docker push "${ghcr_image}:${IMAGE_NAME}-${tag}" +======= + docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${tag}" + docker push "${ghcr_image}:${tag}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Chown workspace uses: ./.github/actions/chown-workspace @@ -145,5 +211,9 @@ jobs: if: always() - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml new file mode 100644 index 000000000000..065030dbf68b --- /dev/null +++ b/.github/workflows/docker-cache-mi300.yml @@ -0,0 +1,55 @@ +name: docker-cache-mi300 + +on: + # run every 6 hours + schedule: + - cron: 0 0,6,12,18 * * * + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + docker-cache: + if: github.repository_owner == 'pytorch' + runs-on: rocm-docker + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + with: + no-sudo: true + + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: false + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + push: false + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Tar and upload to S3 bucket + run: | + sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }} + sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index fa8116f03109..7af30009ce08 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -37,7 +37,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ 
github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,7 +56,11 @@ jobs: matrix: ${{ steps.generate-matrix.outputs.matrix }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: true @@ -82,13 +90,21 @@ jobs: CUDNN_VERSION: ${{ matrix.cudnn_version }} steps: - name: Setup SSH (Click me for login details) +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 submodules: 'recursive' @@ -96,18 +112,30 @@ jobs: uses: ./.github/actions/setup-linux - name: Login to GitHub Container Registry if: ${{ env.WITH_PUSH == 'true' }} +<<<<<<< HEAD uses: docker/login-action@v2 +======= + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: registry: ghcr.io username: pytorch password: ${{ secrets.GHCR_PAT }} # Setup multi-arch image builds - name: Set up QEMU +<<<<<<< HEAD uses: docker/setup-qemu-action@v3 env: QEMU_BINARY_PATH: ${{ runner.temp }}/bin - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 +======= + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 + env: + QEMU_BINARY_PATH: ${{ runner.temp }}/bin + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: version: latest driver-opts: image=moby/buildkit:v0.19.0 @@ -156,7 +184,11 @@ jobs: docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" +<<<<<<< HEAD # Please note, here we ned to pin specific verison of CUDA as with latest label +======= + # Please note, here we need to pin specific version of CUDA as with latest label +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if [[ ${CUDA_VERSION_SHORT} == "${STABLE_CUDA_VERSION}" ]]; then docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \ ghcr.io/pytorch/pytorch-nightly:latest @@ -164,12 +196,20 @@ jobs: fi - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() validate: needs: build +<<<<<<< HEAD uses: 
pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: channel: test ref: main diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 108fff638950..8108bb233f05 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -18,6 +18,12 @@ on: - 'ciflow/binaries_wheel/*' workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds ALPINE_IMAGE: "arm64v8/alpine" @@ -38,7 +44,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -55,8 +65,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -79,8 +94,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu-aarch64 @@ -103,8 +123,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu-aarch64 @@ -112,7 +137,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_9-cuda-aarch64-12_8-build: +======= + manywheel-py3_9-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: 
./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -121,16 +150,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_9-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -138,16 +176,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_9-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_9-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_9-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -156,6 +208,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -171,8 +233,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -195,8 +262,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-aarch64 @@ -219,8 +291,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-aarch64 @@ -228,7 +305,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD 
manywheel-py3_10-cuda-aarch64-12_8-build: +======= + manywheel-py3_10-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -237,16 +318,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -254,16 +344,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_10-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_10-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -272,6 +376,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -287,8 +401,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -311,8 +430,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 @@ -335,8 +459,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-aarch64 @@ -344,7 +473,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cuda-aarch64-12_8-build: +======= + manywheel-py3_11-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -353,16 +486,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_11-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -370,16 +512,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_11-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_11-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -388,6 +544,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -403,8 +569,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -427,8 +598,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 @@ -451,8 +627,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 
+<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-aarch64 @@ -460,7 +641,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cuda-aarch64-12_8-build: +======= + manywheel-py3_12-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -469,16 +654,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_12-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -486,16 +680,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_12-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_12-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -504,6 +712,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -519,8 +737,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -543,8 +766,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 @@ -567,8 +795,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-aarch64 @@ -576,7 +809,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cuda-aarch64-12_8-build: +======= + manywheel-py3_13-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -585,16 +822,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -602,16 +848,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} 
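The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are pipe-separated PEP 508 requirement strings, each pinning an exact NVIDIA wheel version and guarded by an environment marker so the pin only applies on Linux x86_64 hosts. The sketch below is illustrative only: it assumes the " | " separator is how these strings are meant to be split (that is how they appear in the workflow text; the consuming build script is not shown here), uses the third-party packaging library, and the extra_reqs value is a shortened excerpt of the full list.

# Hedged sketch: inspect a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style string.
# Assumption: " | " separates individual PEP 508 requirements, as in the
# workflow text above; the real consumer of this variable is not shown here.
from packaging.requirements import Requirement

extra_reqs = (
    "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for raw in extra_reqs.split(" | "):
    req = Requirement(raw.strip())
    # The marker restricts the pinned dependency to Linux x86_64 machines.
    applies = req.marker.evaluate() if req.marker is not None else True
    print(f"{req.name}{req.specifier}: {'install' if applies else 'skip'} on this host")

On an aarch64 machine every marker above evaluates to False, so these x86_64-only pins would simply be skipped at install time.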
manywheel-py3_13-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_13-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_13-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -620,6 +880,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -635,8 +905,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -659,8 +934,13 @@ jobs: # favor of 
GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu-aarch64 @@ -683,8 +963,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-aarch64 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28_aarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu-aarch64 @@ -692,7 +977,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cuda-aarch64-12_8-build: +======= + manywheel-py3_13t-cuda-aarch64-12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -701,16 +990,25 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" +<<<<<<< HEAD build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' @@ -718,16 +1016,30 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda-aarch64-12_8-upload: # Uploading +======= + build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read +<<<<<<< HEAD needs: manywheel-py3_13t-cuda-aarch64-12_8-build +======= + needs: manywheel-py3_13t-cuda-aarch64-12_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8-aarch64 GPU_ARCH_TYPE: cuda-aarch64 @@ -736,6 +1048,16 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda-aarch64-12_8 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9-aarch64 + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml new file mode 100644 index 000000000000..f7aa0ba20236 --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -0,0 +1,543 @@ +# 
@generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch + + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +permissions: + id-token: write + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 0 +concurrency: + group: linux-binary-libtorch-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cpu-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cpu-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cpu-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: 
libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-cuda12_6-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cuda12_6-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_6-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_6-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_6-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_6-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_6-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_6-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-cuda12_8-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: 
libtorch-cuda12_8-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_8-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_8-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_8-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_8-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_8-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_8-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-cuda12_9-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cuda12_9-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_9-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_9-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + 
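Each binary configuration in these generated workflows follows the same three-job chain: a -build job that calls ./.github/workflows/_binary-build-linux.yml, a -test job that needs the build plus get-label-type, and an -upload job that needs the test and calls _binary-upload.yml, with the configuration carried through the with: inputs. The sketch below is a minimal way to visualise that chain; it assumes PyYAML is installed and only reads the jobs mapping of the generated file added in this diff.

# Minimal sketch (assumes PyYAML is available): print the needs/uses edges of
# a generated binary workflow so the build -> test -> upload chain is visible.
import yaml

path = ".github/workflows/generated-linux-binary-libtorch-nightly.yml"
with open(path) as f:
    jobs = yaml.safe_load(f).get("jobs", {})

for name, job in jobs.items():
    needs = job.get("needs", [])
    if isinstance(needs, str):  # "needs" may be a single job name or a list
        needs = [needs]
    reusable = job.get("uses", "(inline steps)")
    print(f"{name}  needs: {', '.join(needs) or '-'}  uses: {reusable}")

Run against the file above, this would show, for example, that libtorch-cuda12_9-shared-with-deps-release-upload depends on the matching -test job, which in turn depends on the -build job and get-label-type.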
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_9-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_9-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_3-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-rocm6_3-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-rocm6_3-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-rocm6_3-shared-with-deps-release-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-rocm6_3-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: libtorch-cxx11-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + 
working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_3-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-rocm6_3-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_4-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-rocm6_4-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-rocm6_4-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-rocm6_4-shared-with-deps-release-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-rocm6_4-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + 
role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: libtorch-cxx11-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_4-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-rocm6_4-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-release-main.yml b/.github/workflows/generated-linux-binary-libtorch-release-main.yml new file mode 100644 index 000000000000..1b231ca5ffb6 --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-release-main.yml @@ -0,0 +1,87 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-release + + +on: + push: + branches: + - main + tags: + - 'ciflow/trunk/*' + workflow_dispatch: + +permissions: + id-token: write + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 0 +concurrency: + group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: 
+ PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cpu-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + build_environment: linux-binary-libtorch-release + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index 524d7dca0c77..60725edae2b9 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -13,6 +13,12 @@ on: - 'ciflow/trunk/*' workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" @@ -33,12 +39,17 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD manywheel-py3_9-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -86,6 +97,8 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_9-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -98,14 +111,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_6-test: # Testing @@ -122,14 +144,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False 
DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -145,14 +176,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_8-test: # Testing @@ -169,13 +209,162 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_9-build + - get-label-type + uses: 
./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + + manywheel-py3_9-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ 
steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 6d5e940571fc..41b8ae2d9de2 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -18,6 +18,12 @@ on: - 'ciflow/binaries_wheel/*' workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" @@ -38,7 +44,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -55,8 +65,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -77,8 +92,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu @@ -100,8 +120,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu @@ -109,6 +134,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_9-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -243,6 +269,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) manywheel-py3_9-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -255,14 +283,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_6-test: # Testing @@ -279,14 +316,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_6-upload: # Uploading @@ -303,8 +349,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_6 @@ -324,14 +375,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_8-test: # Testing @@ -348,14 +408,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_8-upload: # Uploading @@ -372,8 +441,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_8 @@ -381,7 +455,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_9-rocm6_2_4-build: +======= + manywheel-py3_9-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -390,6 +468,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to 
get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -458,11 +537,33 @@ jobs: id-token: write contents: read needs: manywheel-py3_9-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -471,6 +572,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda 
+ DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -487,8 +622,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -501,7 +641,11 @@ jobs: needs: - manywheel-py3_9-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -512,8 +656,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" steps: @@ -538,12 +687,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_9-rocm6_3-upload: # Uploading @@ -560,8 +737,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD 
DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-rocm6_3 @@ -569,6 +751,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_9-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_9-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_9-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + 
- name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_9-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_9-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -580,14 +878,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_9-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-xpu-test: # Testing @@ -605,8 +912,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" permissions: @@ -617,7 +929,11 @@ jobs: uses: 
./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -640,12 +956,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_9-xpu-upload: # Uploading @@ -661,8 +997,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-xpu @@ -681,8 +1022,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -703,8 +1049,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu @@ -726,8 +1077,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu @@ -735,6 +1091,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -869,6 +1226,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -881,14 +1240,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -905,14 +1273,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-upload: # Uploading @@ -929,8 +1306,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_6 @@ -950,14 +1332,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -974,14 +1365,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-upload: # Uploading @@ -998,8 +1398,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_8 @@ -1007,7 +1412,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_10-rocm6_2_4-build: +======= + manywheel-py3_10-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1016,6 +1425,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -1084,11 +1494,33 @@ jobs: id-token: write contents: read needs: manywheel-py3_10-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -1097,6 +1529,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + 
DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -1113,8 +1579,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -1127,7 +1598,11 @@ jobs: needs: - manywheel-py3_10-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -1138,8 +1613,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" steps: @@ -1164,12 +1644,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + 
docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_10-rocm6_3-upload: # Uploading @@ -1186,8 +1694,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-rocm6_3 @@ -1195,6 +1708,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_10-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.10" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: 
Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_10-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_10-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1206,14 +1835,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | 
intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-xpu-test: # Testing @@ -1231,8 +1869,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" permissions: @@ -1243,7 +1886,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -1266,12 +1913,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_10-xpu-upload: # Uploading @@ -1287,8 +1954,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-xpu @@ -1307,8 +1979,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" @@ -1329,8 +2006,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu @@ -1352,8 +2034,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu @@ -1361,6 +2048,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1495,6 +2183,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1507,14 +2197,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and 
platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -1531,14 +2230,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-upload: # Uploading @@ -1555,8 +2263,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_6 @@ -1564,6 +2277,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-cuda12_6-full-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1629,6 +2343,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) manywheel-py3_11-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1641,14 +2357,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -1665,14 +2390,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-upload: # Uploading @@ -1689,8 +2423,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_8 @@ -1698,7 +2437,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_11-rocm6_2_4-build: +======= + manywheel-py3_11-cuda12_8-full-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1707,6 +2450,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -1775,19 +2519,143 @@ jobs: id-token: write contents: read needs: manywheel-py3_11-rocm6_2_4-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.2.4 - GPU_ARCH_VERSION: 6.2.4 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.2.4-2.7 - DESIRED_DEVTOOLSET: cxx11-abi - use_split_build: False +======= + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda12_8-full + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_8-full-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda12_8-full-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: 
manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION +<<<<<<< HEAD + DESIRED_CUDA: rocm6.2.4 + GPU_ARCH_VERSION: 6.2.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.2.4-2.7 + DESIRED_DEVTOOLSET: cxx11-abi + use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_2_4 +======= + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_8-full + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_8-full-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_8-full-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_8-full + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 
'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -1804,8 +2672,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -1818,7 +2691,11 @@ jobs: needs: - manywheel-py3_11-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -1829,8 +2706,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" steps: @@ -1855,12 +2737,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + 
id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_11-rocm6_3-upload: # Uploading @@ -1877,8 +2787,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-rocm6_3 @@ -1886,6 +2801,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_11-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_11-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: 
false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_11-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1897,14 +2928,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-xpu-test: # Testing @@ -1922,8 +2962,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" permissions: @@ -1934,7 +2979,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -1957,12 +3006,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_11-xpu-upload: # Uploading @@ -1978,8 +3047,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-xpu @@ -1998,8 +3072,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -2020,8 +3099,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu @@ -2043,8 +3127,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu @@ -2052,6 +3141,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2186,6 +3276,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_12-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2198,14 +3290,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -2222,14 +3323,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-upload: # Uploading @@ -2246,8 +3356,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_6 @@ -2267,14 +3382,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: 
${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing @@ -2291,14 +3415,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-upload: # Uploading @@ -2315,8 +3448,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_8 @@ -2324,7 +3462,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_12-rocm6_2_4-build: +======= + manywheel-py3_12-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2333,6 +3475,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -2401,11 +3544,33 @@ jobs: id-token: write contents: read needs: manywheel-py3_12-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -2414,6 +3579,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2430,8 +3629,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -2444,7 +3648,11 @@ jobs: needs: - manywheel-py3_12-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -2455,8 +3663,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 
DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" steps: @@ -2481,37 +3694,186 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_12-rocm6_3-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-rocm6_3-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.3 + GPU_ARCH_VERSION: 6.3 + GPU_ARCH_TYPE: rocm +<<<<<<< HEAD + DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 + DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + use_split_build: False + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-rocm6_3 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +<<<<<<< HEAD +======= + manywheel-py3_12-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + 
build_name: manywheel-py3_12-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.12" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_12-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm6_3-upload: # Uploading + manywheel-py3_12-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-rocm6_3-test + needs: manywheel-py3_12-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 6.3 + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 - DESIRED_DEVTOOLSET: cxx11-abi + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 use_split_build: False DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm6_3 + build_name: manywheel-py3_12-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) manywheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2523,14 +3885,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-xpu-test: # Testing @@ -2548,8 +3919,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" permissions: @@ -2560,7 +3936,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -2583,12 +3963,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: 
calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_12-xpu-upload: # Uploading @@ -2604,8 +4004,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-xpu @@ -2624,8 +4029,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -2646,8 +4056,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu @@ -2669,8 +4084,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu @@ -2678,6 +4098,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2812,6 +4233,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2824,14 +4247,23 @@ jobs: 
DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -2848,14 +4280,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 
GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-upload: # Uploading @@ -2872,8 +4313,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_6 @@ -2893,14 +4339,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2917,14 +4372,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-upload: # Uploading @@ -2941,8 +4405,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cuda12_8 @@ -2950,7 +4419,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13-rocm6_2_4-build: +======= + manywheel-py3_13-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2959,6 +4432,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -3027,11 +4501,33 @@ jobs: id-token: 
write contents: read needs: manywheel-py3_13-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -3040,6 +4536,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13" + build_name: 
manywheel-py3_13-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3056,8 +4586,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -3070,7 +4605,11 @@ jobs: needs: - manywheel-py3_13-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -3081,8 +4620,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" steps: @@ -3107,12 +4651,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_13-rocm6_3-upload: # Uploading @@ -3129,8 +4701,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + 
DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-rocm6_3 @@ -3138,6 +4715,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_13-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_13-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_13-rocm6_4-upload: # Uploading + if: ${{ 
github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3149,14 +4842,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-xpu-test: # Testing @@ -3174,8 +4876,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" permissions: @@ -3186,7 +4893,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: 
aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -3209,12 +4920,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_13-xpu-upload: # Uploading @@ -3230,8 +4961,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-xpu @@ -3250,8 +4986,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -3272,8 +5013,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu @@ -3295,8 +5041,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cpu @@ -3304,6 +5055,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-cpu-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3438,6 +5190,8 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3450,14 +5204,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -3474,14 +5237,23 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.4xlarge.nvidia.gpu +======= + runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-upload: # Uploading @@ -3498,8 +5270,13 @@ jobs: DESIRED_CUDA: cu126 GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_6 @@ -3519,14 +5296,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64' +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -3543,14 +5329,23 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner +======= + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-upload: # Uploading @@ -3567,8 +5362,13 @@ jobs: DESIRED_CUDA: cu128 GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False 
DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-cuda12_8 @@ -3576,7 +5376,11 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD manywheel-py3_13t-rocm6_2_4-build: +======= + manywheel-py3_13t-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3585,6 +5389,7 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -3653,11 +5458,33 @@ jobs: id-token: write contents: read needs: manywheel-py3_13t-rocm6_2_4-test +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: rocm6.2.4 GPU_ARCH_VERSION: 6.2.4 GPU_ARCH_TYPE: rocm @@ -3666,6 +5493,40 @@ jobs: use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_2_4 +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda12_9 + 
build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda12_9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3682,8 +5543,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" @@ -3696,7 +5562,11 @@ jobs: needs: - manywheel-py3_13t-rocm6_3-build - get-label-type +<<<<<<< HEAD runs-on: linux.rocm.gpu +======= + runs-on: linux.rocm.gpu.mi250 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 240 env: PYTORCH_ROOT: /pytorch @@ -3707,8 +5577,13 @@ jobs: GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" steps: @@ -3733,12 +5608,40 @@ jobs: - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:rocm6.3-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.3 + docker-build-dir: .ci/docker + working-directory: pytorch + - 
name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown ROCm uses: ./.github/actions/teardown-rocm manywheel-py3_13t-rocm6_3-upload: # Uploading @@ -3755,8 +5658,13 @@ jobs: DESIRED_CUDA: rocm6.3 GPU_ARCH_VERSION: 6.3 GPU_ARCH_TYPE: rocm +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:rocm6.3-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-rocm6_3 @@ -3764,6 +5672,122 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + manywheel-py3_13t-rocm6_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-rocm6_4 + build_environment: linux-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-rocm6_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-rocm6_4-build + - get-label-type + runs-on: linux.rocm.gpu.mi250 + timeout-minutes: 240 + env: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + SKIP_ALL_TESTS: 1 + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + steps: + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: manywheel-py3_13t-rocm6_4 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: configure aws credentials + id: aws_creds + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + - name: Calculate docker image + id: calculate-docker-image + 
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: rocm6.4 + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + manywheel-py3_13t-rocm6_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-rocm6_4-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: 6.4 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + use_split_build: False + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-rocm6_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) manywheel-py3_13t-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3775,14 +5799,23 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-xpu build_environment: linux-binary-manywheel +<<<<<<< HEAD PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 +======= + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | 
umf==0.10.0 | intel-pti==0.12.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-xpu-test: # Testing @@ -3800,8 +5833,13 @@ jobs: DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" permissions: @@ -3812,7 +5850,11 @@ jobs: uses: ./.github/actions/setup-xpu - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v1.7.0 +======= + uses: aws-actions/configure-aws-credentials@v4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 @@ -3835,12 +5877,32 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +<<<<<<< HEAD - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 with: docker-image: pytorch/manylinux2_28-builder:xpu-2.7 - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary +======= + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} + docker-image-name: manylinux2_28-builder + custom-tag-prefix: xpu + docker-build-dir: .ci/docker + working-directory: pytorch + - name: Pull Docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Test Pytorch binary + uses: ./pytorch/.github/actions/test-pytorch-binary + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Teardown XPU uses: ./.github/actions/teardown-xpu manywheel-py3_13t-xpu-upload: # Uploading @@ -3856,8 +5918,13 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:xpu-2.7 DESIRED_DEVTOOLSET: cxx11-abi +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: xpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13t" build_name: manywheel-py3_13t-xpu diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index 4d97845dd9fe..a6e37eaad62e 100644 --- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -18,6 +18,12 @@ on: - 'ciflow/binaries_wheel/*' workflow_dispatch: +<<<<<<< HEAD +======= +permissions: + id-token: write + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) env: # Needed for conda builds ALPINE_IMAGE: "docker.io/s390x/alpine" @@ -38,7 +44,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -55,7 +65,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" runs_on: linux.s390x @@ -78,7 +93,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu-s390x @@ -100,7 +120,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cpu-s390x @@ -119,7 +144,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" runs_on: linux.s390x @@ -142,7 +172,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-s390x @@ -164,7 +199,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cpu-s390x @@ -183,7 +223,12 @@ jobs: # favor of GPU_ARCH_VERSION 
DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" runs_on: linux.s390x @@ -206,7 +251,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-s390x @@ -228,7 +278,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cpu-s390x @@ -247,7 +302,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" runs_on: linux.s390x @@ -270,7 +330,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-s390x @@ -292,7 +357,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cpu-s390x @@ -311,7 +381,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" runs_on: linux.s390x @@ -334,7 +409,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-s390x @@ -356,7 +436,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu-s390x +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-2.7 +======= + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use_split_build: False DESIRED_PYTHON: "3.13" build_name: manywheel-py3_13-cpu-s390x diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml new file mode 100644 index 000000000000..9f1976c6de04 --- /dev/null +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -0,0 +1,136 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-arm64-binary-libtorch-release + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-arm64-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 0 +concurrency: + group: macos-arm64-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-14-xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + if [ -d "/Applications/Xcode_14.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + elif [ -d "/Applications/Xcode_13.3.1.app" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + set -eux -o pipefail + # shellcheck disable=SC1090 + source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" + mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" + + # Build + USE_PYTORCH_METAL_EXPORT=1 + USE_COREML_DELEGATE=1 + TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}" + export USE_PYTORCH_METAL_EXPORT + export USE_COREML_DELEGATE + export TORCH_PACKAGE_NAME + "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cpu-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cpu-shared-with-deps-release-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cpu + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cpu-shared-with-deps-release + use_s3: False + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index d5eb16b786eb..79d110248479 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -145,7 +145,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + 
DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.9" build_name: wheel-py3_9-cpu use_s3: False @@ -267,7 +272,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.10" build_name: wheel-py3_10-cpu use_s3: False @@ -389,7 +399,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.11" build_name: wheel-py3_11-cpu use_s3: False @@ -511,7 +526,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.12" build_name: wheel-py3_12-cpu use_s3: False @@ -633,7 +653,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13" build_name: wheel-py3_13-cpu use_s3: False @@ -755,7 +780,12 @@ jobs: # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu +<<<<<<< HEAD DOCKER_IMAGE: pytorch/manylinux2_28-builder:cpu-2.7 +======= + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cpu +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DESIRED_PYTHON: "3.13t" build_name: wheel-py3_13t-cpu use_s3: False diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 1c9888286ab1..7aa7fa626216 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -1,11 +1,19 @@ # @generated DO NOT EDIT MANUALLY +<<<<<<< HEAD # Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2 +======= +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Generation script: .github/scripts/generate_ci_workflows.py name: windows-arm64-binary-libtorch-debug on: push: +<<<<<<< HEAD +======= + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) branches: - nightly tags: @@ -17,24 +25,44 @@ on: workflow_dispatch: env: +<<<<<<< HEAD 
+======= + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-debug GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows-arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PYTORCH_ROOT: /pytorch DOWNLOADS_DIR: c:\temp\downloads DEPENDENCIES_DIR: c:\temp\dependencies ENABLE_APL: 1 ENABLE_OPENBLAS: 0 MSVC_VERSION : 14.42 +<<<<<<< HEAD AWS_DEFAULT_REGION: us-east-1 +======= +concurrency: + group: windows-arm64-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -43,8 +71,13 @@ jobs: libtorch-cpu-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -59,9 +92,12 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | @@ -73,10 +109,19 @@ jobs: run: | mkdir "%NIGHTLIES_PYTORCH_ROOT%" mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" +<<<<<<< HEAD +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: path: "pytorch" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | @@ -93,6 +138,8 @@ jobs: uses: actions/checkout@v4 with: path: "pytorch" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive - name: Bootstrap Python shell: cmd @@ -117,11 +164,19 @@ jobs: - name: Populate binary env shell: bash run: | +<<<<<<< HEAD "pytorch/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_build.sh" +======= + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: actions/upload-artifact@v4.4.0 if: always() with: @@ -134,8 +189,13 @@ jobs: needs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type +<<<<<<< HEAD runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -150,15 +210,19 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% +<<<<<<< HEAD - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -176,6 +240,13 @@ jobs: shell: cmd run: | rmdir /s /q "pytorch" +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: @@ -189,14 +260,18 @@ jobs: shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Bootstrap Rust shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" +<<<<<<< HEAD - name: Populate binary env shell: bash run: | @@ -205,6 +280,21 @@ jobs: shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_test.sh" +======= + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 68600ac7ab9c..0ac01b390fee 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -1,11 +1,19 @@ # @generated DO NOT EDIT MANUALLY +<<<<<<< HEAD # Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2 +======= +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Generation script: .github/scripts/generate_ci_workflows.py name: windows-arm64-binary-libtorch-release on: push: +<<<<<<< HEAD +======= + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) branches: - nightly tags: @@ -17,24 +25,44 @@ on: workflow_dispatch: env: +<<<<<<< HEAD +======= + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-release GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows-arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PYTORCH_ROOT: /pytorch DOWNLOADS_DIR: c:\temp\downloads DEPENDENCIES_DIR: c:\temp\dependencies ENABLE_APL: 1 ENABLE_OPENBLAS: 0 MSVC_VERSION : 14.42 +<<<<<<< HEAD AWS_DEFAULT_REGION: us-east-1 +======= +concurrency: + group: windows-arm64-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -43,8 +71,13 @@ jobs: libtorch-cpu-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type +<<<<<<< HEAD runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -59,9 +92,12 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | @@ -73,10 +109,19 @@ jobs: run: | mkdir "%NIGHTLIES_PYTORCH_ROOT%" mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" +<<<<<<< HEAD +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: path: "pytorch" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | @@ -93,6 +138,8 @@ jobs: uses: actions/checkout@v4 with: path: "pytorch" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive - name: Bootstrap Python shell: cmd @@ -117,11 +164,19 @@ jobs: - name: Populate binary env shell: bash run: | +<<<<<<< HEAD "pytorch/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_build.sh" +======= + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: actions/upload-artifact@v4.4.0 if: always() with: @@ -134,8 +189,13 @@ jobs: needs: - libtorch-cpu-shared-with-deps-release-build - get-label-type +<<<<<<< HEAD runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -150,15 +210,19 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% +<<<<<<< HEAD - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -176,6 +240,13 @@ jobs: shell: cmd run: | rmdir /s /q "pytorch" +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: @@ -189,14 +260,18 @@ jobs: shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Bootstrap Rust shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" +<<<<<<< HEAD - name: Populate binary env shell: bash run: | @@ -205,6 +280,21 @@ jobs: shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_test.sh" +======= + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1b6373276f5e..30fef7f4e2e6 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -1,11 +1,19 @@ # @generated DO NOT EDIT MANUALLY +<<<<<<< HEAD # Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2 +======= +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Generation script: .github/scripts/generate_ci_workflows.py name: windows-arm64-binary-wheel on: push: +<<<<<<< HEAD +======= + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) branches: - nightly tags: @@ -17,34 +25,62 @@ on: workflow_dispatch: env: +<<<<<<< HEAD +======= + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) BUILD_ENVIRONMENT: windows-arm64-binary-wheel GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows-arm64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) PYTORCH_ROOT: /pytorch DOWNLOADS_DIR: c:\temp\downloads DEPENDENCIES_DIR: c:\temp\dependencies ENABLE_APL: 1 ENABLE_OPENBLAS: 0 MSVC_VERSION : 14.42 +<<<<<<< HEAD AWS_DEFAULT_REGION: us-east-1 +======= +concurrency: + group: windows-arm64-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + wheel-py3_11-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -53,11 +89,16 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= + DESIRED_PYTHON: "3.11" + steps: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | @@ -69,10 +110,19 @@ jobs: run: | mkdir "%NIGHTLIES_PYTORCH_ROOT%" mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" +<<<<<<< HEAD +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: path: "pytorch" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | @@ -89,6 +139,8 @@ jobs: uses: actions/checkout@v4 with: path: "pytorch" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) submodules: recursive - name: Bootstrap Python shell: cmd @@ -113,6 +165,7 @@ jobs: - name: Populate binary env shell: bash run: | +<<<<<<< HEAD "pytorch/.circleci/scripts/binary_populate_env.sh" - name: Build PyTorch binary shell: bash @@ -132,6 +185,27 @@ jobs: - get-label-type runs-on: "windows-11-arm64" timeout-minutes: 240 +======= + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_11-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cpu-build + - get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -140,17 +214,23 @@ jobs: DESIRED_CUDA: cpu GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 +<<<<<<< HEAD DESIRED_PYTHON: "3.12" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
+======= + DESIRED_PYTHON: "3.11" + steps: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: cmd run: | echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% +<<<<<<< HEAD - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: @@ -168,6 +248,13 @@ jobs: shell: cmd run: | rmdir /s /q "pytorch" +======= + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Git checkout PyTorch uses: actions/checkout@v4 with: @@ -181,14 +268,18 @@ jobs: shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" +<<<<<<< HEAD - name: Bootstrap Build Tools shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Bootstrap Rust shell: cmd run: | "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" +<<<<<<< HEAD - name: Populate binary env shell: bash run: | @@ -197,6 +288,168 @@ jobs: shell: bash run: | "pytorch/.circleci/scripts/binary_windows_arm64_test.sh" +======= + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + wheel-py3_11-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_12-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.12" + steps: + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Bootstrap folders + shell: cmd + run: | + mkdir "%NIGHTLIES_PYTORCH_ROOT%" + mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 
+ with: + path: "pytorch" + submodules: recursive + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - name: Bootstrap sccache + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat" + - name: Bootstrap Libuv + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_12-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_12-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_12-cpu-build + - get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.12" + steps: + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_12-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: @@ -215,3 +468,153 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + wheel-py3_13-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.13" + steps: + - name: Populate binary env + 
shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Bootstrap folders + shell: cmd + run: | + mkdir "%NIGHTLIES_PYTORCH_ROOT%" + mkdir "%PYTORCH_FINAL_PACKAGE_DIR%" + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - name: Bootstrap sccache + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat" + - name: Bootstrap Libuv + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_13-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_13-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_13-cpu-build + - get-label-type + runs-on: "windows-11-arm64-preview" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.13" + steps: + - name: Populate binary env + shell: cmd + run: | + echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV% + echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV% + echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV% + - name: Enable long paths + shell: cmd + run: | + git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now" + git config --system core.longpaths true + - name: Git checkout PyTorch + uses: actions/checkout@v4 + with: + path: "pytorch" + submodules: recursive + - name: Bootstrap APL + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat" + - name: Bootstrap Python + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat" + - name: Bootstrap Rust + shell: cmd + run: | + "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_13-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + wheel-py3_13-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_13-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy 
variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml index 98accb3deec9..54f0687ab6ff 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml @@ -19,6 +19,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -27,7 +31,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,6 +60,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -67,7 +87,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -96,6 +120,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -105,6 +130,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -144,6 +171,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -180,7 +211,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -209,6 +244,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -223,6 +272,7 @@ jobs: with: name: libtorch-cpu-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -234,6 +284,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 5f02c2636e10..db1283d97318 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -26,6 +26,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -34,7 +38,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -59,6 +67,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -74,7 +94,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -103,6 +127,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -112,6 +137,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -151,6 +178,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -187,7 +218,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -216,6 +251,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -230,6 +279,7 @@ jobs: with: name: libtorch-cpu-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -241,6 +291,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -284,6 +336,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda11_8-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -531,6 +584,8 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_6-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -551,6 +606,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -566,7 +633,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -595,6 +666,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -604,6 +676,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -643,6 +717,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_6-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -680,7 +758,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -709,6 +791,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -723,6 +819,7 @@ jobs: with: name: libtorch-cuda12_6-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -734,6 +831,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -798,6 +897,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -813,7 +924,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -842,6 +957,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -851,6 +967,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -890,6 +1008,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_8-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -927,7 +1049,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -956,6 +1082,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -970,6 +1110,7 @@ jobs: with: name: libtorch-cuda12_8-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -981,6 +1122,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -1025,3 +1168,254 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + libtorch-cuda12_9-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cuda12_9-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + libtorch-cuda12_9-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-debug-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: 
$(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cuda12_9-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_9-shared-with-deps-debug-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + build_name: libtorch-cuda12_9-shared-with-deps-debug + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml index dd8c039761ae..5eda1e8b89cd 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml @@ -19,6 +19,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -27,7 +31,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,6 +60,18 @@ jobs: # without this value pip does not get installed 
for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -67,7 +87,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -96,6 +120,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -105,6 +130,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -144,6 +171,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -180,7 +211,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -209,6 +244,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -223,6 +272,7 @@ jobs: with: name: libtorch-cpu-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -234,6 +284,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index 69f16fbaf95b..77b5789c1f82 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -26,6 +26,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -34,7 +38,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -59,6 +67,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
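The concurrency block shown in these workflow headers groups runs by PR number (or branch name for pushes) so that a newer run cancels any in-progress run for the same key. A trimmed-down sketch of the same pattern, with a hypothetical group prefix:

name: concurrency-sketch
on: [pull_request, push, workflow_dispatch]
concurrency:
  # One group per PR or branch; the workflow_dispatch term keeps manual runs in their own group
  group: windows-binary-sketch-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
jobs:
  noop:
    runs-on: ubuntu-latest
    steps:
      - run: echo "grouped by ${{ github.ref_name }}"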
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -74,7 +94,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -103,6 +127,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -112,6 +137,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -151,6 +178,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cpu-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -187,7 +218,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -216,6 +251,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -230,6 +279,7 @@ jobs: with: name: libtorch-cpu-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -241,6 +291,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -284,6 +336,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD libtorch-cuda11_8-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -531,6 +584,8 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_6-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -551,6 +606,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -566,7 +633,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -595,6 +666,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
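The *-upload jobs above do not define their own steps; they call the repository's reusable workflow ./.github/workflows/_binary-upload.yml, forwarding build metadata through with: and the token through secrets:. A minimal sketch of that caller shape, assuming a hypothetical callee workflow and input (the real _binary-upload.yml inputs are the ones visible in the diff, not reproduced here):

name: reusable-upload-sketch
on: workflow_dispatch
jobs:
  demo-upload:
    permissions:
      id-token: write    # needed when the callee authenticates via OIDC
      contents: read
    uses: ./.github/workflows/_demo-upload.yml   # hypothetical reusable workflow
    with:
      build_name: demo-build                     # hypothetical input declared by the callee
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}  # callee must declare this secret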
@@ -604,6 +676,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -643,6 +717,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_6-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -680,7 +758,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -709,6 +791,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -723,6 +819,7 @@ jobs: with: name: libtorch-cuda12_6-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -734,6 +831,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -798,6 +897,18 @@ jobs: # without this value pip does not get installed for some reason DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -813,7 +924,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -842,6 +957,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -851,6 +967,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -890,6 +1008,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) libtorch-cuda12_8-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -927,7 +1049,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -956,6 +1082,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -970,6 +1110,7 @@ jobs: with: name: libtorch-cuda12_8-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -981,6 +1122,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -1025,3 +1168,254 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD +======= + libtorch-cuda12_9-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: libtorch-cuda12_9-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + libtorch-cuda12_9-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-release-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: 
$(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: libtorch-cuda12_9-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_9-shared-with-deps-release-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.9" + build_name: libtorch-cuda12_9-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index ca9fed87cabb..bb8cbcdf8b62 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -26,6 +26,10 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} SKIP_ALL_TESTS: 1 +<<<<<<< HEAD +======= + OS: windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: windows-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -34,7 +38,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -55,6 +63,18 @@ jobs: SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: 
These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -70,7 +90,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -99,6 +123,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -108,6 +133,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -147,6 +174,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_9-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -179,7 +210,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -208,6 +243,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are 
put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -222,6 +271,7 @@ jobs: with: name: wheel-py3_9-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -233,6 +283,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -272,6 +324,7 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_9-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -507,6 +560,8 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_9-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type @@ -523,6 +578,18 @@ jobs: SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -538,7 +605,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -567,6 +638,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -576,6 +648,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -615,6 +689,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_9-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -648,7 +726,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -677,6 +759,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -691,6 +787,7 @@ jobs: with: name: wheel-py3_9-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -702,6 +799,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -758,6 +857,18 @@ jobs: SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
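The "Clean PyTorch checkout" step in the hunk above leans entirely on git clean -fxd: -f forces the removal, -x also deletes files that are normally ignored (build outputs), and -d recurses into untracked directories, so a reused checkout is reset to tracked content only. A small sketch of that behaviour on a throwaway repository; the repository and the names tracked.txt, build/libtorch.dll and stray.log are made up purely for illustration:

    #!/usr/bin/env bash
    # Demonstrate what git clean -fxd removes, using a scratch repository.
    set -euo pipefail
    repo="$(mktemp -d)"
    git -C "$repo" init -q
    touch "$repo/tracked.txt"
    git -C "$repo" add tracked.txt
    git -C "$repo" -c user.email=ci@example.com -c user.name=ci commit -qm "init"

    # Leftovers from a hypothetical previous run: an ignored build tree and an untracked log.
    echo "build/" > "$repo/.gitignore"
    mkdir -p "$repo/build" && touch "$repo/build/libtorch.dll" "$repo/stray.log"

    git -C "$repo" clean -nxd          # -n: dry run, list what would be deleted
    git -C "$repo" clean -fxd          # what the workflow runs (force, ignored files, directories)
    git -C "$repo" status --short      # nothing left except tracked content

Running the dry-run form first is a cheap way to confirm what the flags will touch before pointing them at a checkout you care about.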
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -773,7 +884,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -802,6 +917,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -811,6 +927,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -850,6 +968,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_9-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -883,7 +1005,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -912,6 +1038,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
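The "Display EC2 information" steps in these jobs call a get_ec2_metadata helper that queries the EC2 instance-metadata service using the IMDSv2 flow: a short-lived session token is requested with a PUT, then that token is sent as a header on the actual metadata GET. A stand-alone version of that helper is sketched below; it mirrors the function defined in these workflow steps, keeps the same 30-second token TTL, and only works when run from inside an EC2 instance:

    #!/usr/bin/env bash
    # IMDSv2 metadata lookup, mirroring the get_ec2_metadata helper used in these steps.
    # 169.254.169.254 is the link-local metadata endpoint, reachable only on EC2.
    set -euo pipefail

    get_ec2_metadata() {
      local category=$1
      local token
      token=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
                -H "X-aws-ec2-metadata-token-ttl-seconds: 30")
      curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
        "http://169.254.169.254/latest/meta-data/${category}"
    }

    echo "ami-id:        $(get_ec2_metadata ami-id)"
    echo "instance-id:   $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"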
@@ -926,6 +1066,7 @@ jobs: with: name: wheel-py3_9-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -937,6 +1078,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -977,7 +1120,11 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD wheel-py3_9-xpu-build: +======= + wheel-py3_9-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -987,12 +1134,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION +<<<<<<< HEAD DESIRED_CUDA: xpu GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -1008,7 +1173,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1037,15 +1206,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1068,7 +1230,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_9-xpu + name: wheel-py3_9-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1085,20 +1247,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-xpu-test: # Testing + + wheel-py3_9-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_9-xpu-build + - wheel-py3_9-cuda12_9-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: @@ -1117,7 +1281,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1146,6 +1310,18 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -1155,22 +1331,13 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_9-xpu + name: wheel-py3_9-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - name: Populate binary env shell: bash run: | @@ -1192,25 +1359,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-xpu-upload: # Uploading + wheel-py3_9-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_9-xpu-test + needs: wheel-py3_9-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-xpu + build_name: wheel-py3_9-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: + wheel-py3_9-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -1220,11 +1388,21 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.9" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -1240,7 +1418,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1269,15 +1447,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1300,7 +1470,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cpu + name: wheel-py3_9-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1317,10 +1487,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_9-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cpu-build + - wheel-py3_9-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 @@ -1329,10 +1503,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information shell: bash @@ -1349,7 +1523,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1378,6 +1556,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + 
- name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -1390,8 +1582,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cpu + name: wheel-py3_9-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1403,6 +1596,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -1424,25 +1619,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading + wheel-py3_9-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cpu-test + needs: wheel-py3_9-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda11_8-build: + wheel-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -1452,12 +1647,23 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -1473,7 +1679,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1502,6 +1712,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -1511,6 +1722,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1533,7 +1746,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda11_8 + name: wheel-py3_10-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1550,21 +1763,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda11_8-build + - wheel-py3_10-cpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -1583,7 +1799,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1612,6 +1832,20 @@ 
jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -1624,8 +1858,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda11_8 + name: wheel-py3_10-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -1637,6 +1872,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -1658,26 +1895,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_8-upload: # Uploading + wheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda11_8-test + needs: wheel-py3_10-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda11_8 + build_name: wheel-py3_10-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_6-build: +<<<<<<< HEAD + wheel-py3_10-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -1687,8 +1924,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -1768,7 +2005,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_6 + name: wheel-py3_10-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1785,10 +2022,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_6-test: # Testing + wheel-py3_10-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_6-build + - wheel-py3_10-cuda11_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -1797,8 +2034,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - 
DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -1859,7 +2096,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_6 + name: wheel-py3_10-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -1893,26 +2130,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_6-upload: # Uploading + wheel-py3_10-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_6-test + needs: wheel-py3_10-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_6 + build_name: wheel-py3_10-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_8-build: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -1922,12 +2161,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -1943,7 +2194,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1972,6 +2227,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -1981,6 +2237,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2003,7 +2261,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_8 + name: wheel-py3_10-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2020,10 +2278,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_8-build + - wheel-py3_10-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -2032,8 +2294,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -2053,7 +2315,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2082,6 +2348,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be 
sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2094,8 +2374,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_8 + name: wheel-py3_10-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2107,6 +2388,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -2128,26 +2411,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_8-upload: # Uploading + wheel-py3_10-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_8-test + needs: wheel-py3_10-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_8 + build_name: wheel-py3_10-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-xpu-build: + wheel-py3_10-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -2157,12 +2440,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -2178,7 +2473,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2207,6 +2506,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2216,6 +2516,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2238,7 +2540,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-xpu + name: wheel-py3_10-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2255,20 +2557,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-xpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-xpu-build + - wheel-py3_10-cuda12_8-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -2287,7 +2594,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2316,6 +2627,20 @@ 
jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2328,8 +2653,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-xpu + name: wheel-py3_10-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2341,6 +2667,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -2362,25 +2690,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-xpu-upload: # Uploading + wheel-py3_10-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-xpu-test + needs: wheel-py3_10-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-xpu + build_name: wheel-py3_10-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: +<<<<<<< HEAD + wheel-py3_10-xpu-build: +======= + wheel-py3_10-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -2390,11 +2723,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: 
cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -2410,7 +2762,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2439,15 +2795,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2470,7 +2819,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cpu + name: wheel-py3_10-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2487,22 +2836,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-test: # Testing + + wheel-py3_10-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cpu-build + - wheel-py3_10-cuda12_9-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -2519,7 +2870,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2548,6 +2899,18 @@ jobs: # Let's both 
exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2557,22 +2920,1602 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cpu + name: wheel-py3_10-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout + - name: Populate binary env + shell: bash run: | - # Remove any artifacts from the previous checkouts - git clean -fxd + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_9-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-xpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | 
onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_10-xpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_10-xpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_10-xpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable 
fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
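
The NOTE above and the "Populate binary env" step that follows it rely on a runner mechanism worth spelling out: a step cannot set workflow-level env, but anything it appends to the file named by GITHUB_ENV becomes an environment variable for every later step of the same job, and only a running step can read runner.temp (exposed to shells as RUNNER_TEMP). A minimal Python sketch of that mechanism, purely illustrative since the workflow does it with the bash echo lines shown here:

    import os

    def export_to_later_steps(**pairs: str) -> None:
        """Append KEY=VALUE lines to the file named by GITHUB_ENV so that
        subsequent steps in the same job see them as environment variables."""
        with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as fh:
            for key, value in pairs.items():
                fh.write(f"{key}={value}\n")

    if __name__ == "__main__":
        runner_temp = os.environ["RUNNER_TEMP"]  # only available inside a running step
        export_to_later_steps(
            BINARY_ENV_FILE=f"{runner_temp}/env",
            PYTORCH_FINAL_PACKAGE_DIR=f"{runner_temp}/artifacts",
            WIN_PACKAGE_WORK_DIR=runner_temp,
        )
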
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_10-xpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-xpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-xpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-xpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_11-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
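
The upload-artifact, download-artifact and *-upload jobs above all address a build through one composed name (wheel-py3_10-xpu, wheel-py3_11-cpu, and so on). The convention appears to be package type, then the Python version with dots replaced by underscores, then the accelerator tag; the helper below is hypothetical, since this workflow is generated from templates rather than from such a function, but it shows the assumed pattern:

    def binary_build_name(package_type: str, python_version: str, accelerator: str) -> str:
        """Compose an artifact/build name such as 'wheel-py3_11-cpu'."""
        py_tag = "py" + python_version.replace(".", "_")
        return f"{package_type}-{py_tag}-{accelerator}"

    assert binary_build_name("wheel", "3.10", "xpu") == "wheel-py3_10-xpu"
    assert binary_build_name("wheel", "3.11", "cpu") == "wheel-py3_11-cpu"
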
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
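
The "Display EC2 information" steps above use IMDSv2: curl first PUTs to http://169.254.169.254/latest/api/token with a TTL header to obtain a short-lived session token, then presents that token in the X-aws-ec2-metadata-token header when reading a meta-data category. A rough Python equivalent of that shell function, assuming it runs on an EC2 instance where the metadata endpoint is reachable:

    import urllib.request

    IMDS = "http://169.254.169.254/latest"

    def get_ec2_metadata(category: str, ttl_seconds: int = 30) -> str:
        """Fetch one metadata category (e.g. 'ami-id', 'instance-type') via IMDSv2."""
        # Step 1: obtain a short-lived session token.
        token_req = urllib.request.Request(
            f"{IMDS}/api/token",
            method="PUT",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": str(ttl_seconds)},
        )
        token = urllib.request.urlopen(token_req, timeout=5).read().decode()
        # Step 2: read the requested category, presenting the token.
        meta_req = urllib.request.Request(
            f"{IMDS}/meta-data/{category}",
            headers={"X-aws-ec2-metadata-token": token},
        )
        return urllib.request.urlopen(meta_req, timeout=5).read().decode()

    if __name__ == "__main__":
        for category in ("ami-id", "instance-id", "instance-type"):
            print(f"{category}: {get_ec2_metadata(category)}")
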
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cpu-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: 
cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD + wheel-py3_11-cuda11_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
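
The "Enable long paths on Windows" steps above set LongPathsEnabled under HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem so deep checkout and build paths do not hit the legacy 260-character MAX_PATH limit (see the pytorch/pytorch#73339 comment linked in the workflow). The workflow does this with Set-ItemProperty in PowerShell; an equivalent sketch with Python's standard winreg module, which needs an elevated process on Windows, looks like this:

    import winreg

    def enable_long_paths() -> None:
        """Set HKLM\\SYSTEM\\CurrentControlSet\\Control\\FileSystem\\LongPathsEnabled to 1,
        mirroring the Set-ItemProperty step in the workflow. Requires admin rights."""
        key_path = r"SYSTEM\CurrentControlSet\Control\FileSystem"
        with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, key_path, 0, winreg.KEY_SET_VALUE) as key:
            winreg.SetValueEx(key, "LongPathsEnabled", 0, winreg.REG_DWORD, 1)

    if __name__ == "__main__":
        enable_long_paths()
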
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cuda11_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cuda11_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda11_8-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cuda11_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cuda11_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda11_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda11_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-cuda12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 
300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cuda12_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-cuda12_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda12_6-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata 
instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cuda12_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda12_6-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_11-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
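
Every CUDA job above carries both DESIRED_CUDA (the legacy short tag such as cu126, cu128 or cu129) and GPU_ARCH_VERSION (the dotted version the TODO comments say should eventually replace it). The correspondence is mechanical; the helper below is only an illustration of the relationship implied by those names, not code from this repository:

    def desired_cuda_to_arch_version(desired_cuda: str) -> str:
        """Turn a legacy 'cuXYZ' tag into a dotted version, e.g. 'cu129' -> '12.9'.
        Assumes the tag is 'cu' + major digits + one minor digit, which holds for
        the cu118/cu126/cu128/cu129 tags used in this workflow."""
        if not desired_cuda.startswith("cu") or not desired_cuda[2:].isdigit():
            raise ValueError(f"not a CUDA tag: {desired_cuda!r}")
        digits = desired_cuda[2:]
        return f"{digits[:-1]}.{digits[-1]}"

    assert desired_cuda_to_arch_version("cu126") == "12.6"
    assert desired_cuda_to_arch_version("cu129") == "12.9"
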
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cuda12_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda12_8-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cuda12_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_11-cuda12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda12_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + 
DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml +<<<<<<< HEAD + wheel-py3_11-xpu-build: +======= + wheel-py3_11-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + if: ${{ github.repository_owner == 'pytorch' }} + needs: get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v4.4.0 + if: always() + with: + name: wheel-py3_11-cuda12_9 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + wheel-py3_11-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - wheel-py3_11-cuda12_9-build + - get-label-type + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + timeout-minutes: 300 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.11" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: 
pytorch/test-infra/.github/actions/setup-ssh@release/2.8 + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon + shell: bash + run: | + git config --global core.longpaths true + git config --global core.symlinks true + + # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock + # the directory on Windows and prevent GHA from checking out as reported + # in https://github.com/actions/checkout/issues/1018 + git config --global core.fsmonitor false + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= + - uses: actions/download-artifact@v4.1.7 + name: Download Build Artifacts + with: + name: wheel-py3_11-cuda12_9 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash run: | @@ -2594,25 +4537,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-upload: # Uploading + wheel-py3_11-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cpu-test + needs: wheel-py3_11-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu + build_name: wheel-py3_11-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda11_8-build: + wheel-py3_11-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -2622,12 +4566,21 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -2643,7 +4596,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2672,15 +4625,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2703,7 +4648,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda11_8 + name: wheel-py3_11-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2720,21 +4665,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda11_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_11-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda11_8-build + - wheel-py3_11-xpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" steps: @@ -2753,7 +4701,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2782,6 +4734,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: 
actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2794,8 +4760,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda11_8 + name: wheel-py3_11-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2807,6 +4774,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -2828,26 +4797,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda11_8-upload: # Uploading + wheel-py3_11-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda11_8-test + needs: wheel-py3_11-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda11_8 + build_name: wheel-py3_11-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_6-build: + wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -2857,12 +4825,23 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -2878,7 +4857,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2907,6 +4890,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -2916,6 +4900,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -2938,7 +4924,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_6 + name: wheel-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2955,23 +4941,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_6-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_6-build + - wheel-py3_12-cpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2988,7 +4977,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3017,6 +5010,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3029,8 +5036,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_6 + name: wheel-py3_12-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3042,6 +5050,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -3063,26 +5073,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_6-upload: # Uploading + wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_6-test + needs: wheel-py3_12-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_6 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_8-build: +<<<<<<< HEAD + wheel-py3_12-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -3092,11 +5102,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -3173,7 +5183,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_8 + name: wheel-py3_12-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3190,10 +5200,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_8-test: # Testing + wheel-py3_12-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_8-build + - wheel-py3_12-cuda11_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" 
timeout-minutes: 300 @@ -3202,11 +5212,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -3264,7 +5274,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_8 + name: wheel-py3_12-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -3298,26 +5308,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_8-upload: # Uploading + wheel-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_8-test + needs: wheel-py3_12-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_8 + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-xpu-build: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -3327,12 +5339,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + DESIRED_PYTHON: "3.12" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -3348,7 +5372,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3377,6 +5405,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3386,6 +5415,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3408,7 +5439,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-xpu + name: wheel-py3_12-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3425,22 +5456,27 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-xpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-xpu-build + - wheel-py3_12-cuda12_6-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -3457,7 +5493,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3486,6 +5526,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3498,8 +5552,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-xpu + name: wheel-py3_12-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3511,6 +5566,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -3532,25 +5589,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-xpu-upload: # Uploading + wheel-py3_12-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-xpu-test + needs: wheel-py3_12-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-xpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: + wheel-py3_12-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -3560,11 +5618,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -3580,7 +5651,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3609,6 +5684,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3618,6 +5694,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3640,7 +5718,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cpu + name: wheel-py3_12-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3657,20 +5735,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cpu-build + - wheel-py3_12-cuda12_8-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -3689,7 +5772,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3718,6 +5805,20 @@ 
jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3730,8 +5831,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cpu + name: wheel-py3_12-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3743,6 +5845,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -3764,25 +5868,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-upload: # Uploading + wheel-py3_12-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cpu-test + needs: wheel-py3_12-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu + build_name: wheel-py3_12-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda11_8-build: +<<<<<<< HEAD + wheel-py3_12-xpu-build: +======= + wheel-py3_12-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -3792,12 +5901,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.12" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda 
SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -3813,7 +5940,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3842,15 +5973,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -3873,7 +5997,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda11_8 + name: wheel-py3_12-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3890,10 +6014,11 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda11_8-test: # Testing + + wheel-py3_12-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda11_8-build + - wheel-py3_12-cuda12_9-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -3902,8 +6027,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3923,7 +6048,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3952,6 +6077,18 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - 
name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -3961,22 +6098,13 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda11_8 + name: wheel-py3_12-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - name: Populate binary env shell: bash run: | @@ -3998,26 +6126,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda11_8-upload: # Uploading + wheel-py3_12-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda11_8-test + needs: wheel-py3_12-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda11_8 + build_name: wheel-py3_12-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_6-build: + wheel-py3_12-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4027,12 +6155,21 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a 
workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -4048,7 +6185,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4077,15 +6214,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4108,7 +6237,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_6 + name: wheel-py3_12-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4125,21 +6254,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_6-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_12-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_6-build + - wheel-py3_12-xpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -4158,7 +6290,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4187,6 +6323,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True 
-ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4199,8 +6349,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_6 + name: wheel-py3_12-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4212,6 +6363,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -4233,26 +6386,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_6-upload: # Uploading + wheel-py3_12-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_6-test + needs: wheel-py3_12-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_6 + build_name: wheel-py3_12-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_8-build: + wheel-py3_13-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4262,12 +6414,23 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -4283,7 +6446,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4312,6 +6479,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4321,6 +6489,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4343,7 +6513,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_8 + name: wheel-py3_13-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4360,23 +6530,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_8-build + - wheel-py3_13-cpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -4393,7 +6566,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4422,6 +6599,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4434,8 +6625,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_8 + name: wheel-py3_13-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4447,6 +6639,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -4468,26 +6662,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_8-upload: # Uploading + wheel-py3_13-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_8-test + needs: wheel-py3_13-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_8 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-xpu-build: +<<<<<<< HEAD + wheel-py3_13-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4497,11 +6691,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -4578,7 +6772,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() 
with: - name: wheel-py3_12-xpu + name: wheel-py3_13-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4595,22 +6789,23 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-xpu-test: # Testing + wheel-py3_13-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-xpu-build + - wheel-py3_13-cuda11_8-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" + DESIRED_PYTHON: "3.13" steps: - name: Display EC2 information shell: bash @@ -4668,7 +6863,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-xpu + name: wheel-py3_13-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -4702,25 +6897,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-xpu-upload: # Uploading + wheel-py3_13-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-xpu-test + needs: wheel-py3_13-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-xpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.13" + build_name: wheel-py3_13-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cpu-build: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4730,11 +6928,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -4750,7 +6961,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4779,6 +6994,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4788,6 +7004,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4810,7 +7028,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cpu + name: wheel-py3_13-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4827,20 +7045,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cpu-build + - wheel-py3_13-cuda12_6-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: @@ -4859,7 +7082,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4888,6 +7115,20 @@ 
jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -4900,8 +7141,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cpu + name: wheel-py3_13-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -4913,6 +7155,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -4934,25 +7178,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cpu-upload: # Uploading + wheel-py3_13-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cpu-test + needs: wheel-py3_13-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cpu + build_name: wheel-py3_13-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda11_8-build: + wheel-py3_13-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -4962,12 +7207,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -4983,7 +7240,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5012,6 +7273,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5021,6 +7283,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5043,7 +7307,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda11_8 + name: wheel-py3_13-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5060,10 +7324,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda11_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda11_8-build + - wheel-py3_13-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -5072,8 +7340,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5093,7 +7361,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5122,6 +7394,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be 
sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5134,8 +7420,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda11_8 + name: wheel-py3_13-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5147,6 +7434,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -5168,26 +7457,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda11_8-upload: # Uploading + wheel-py3_13-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda11_8-test + needs: wheel-py3_13-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda11_8 + build_name: wheel-py3_13-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_6-build: +<<<<<<< HEAD + wheel-py3_13-xpu-build: +======= + wheel-py3_13-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -5197,12 +7490,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.13" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: + # NOTE: These 
environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -5218,7 +7529,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5247,15 +7562,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5278,7 +7586,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_6 + name: wheel-py3_13-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5295,10 +7603,11 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_6-test: # Testing + + wheel-py3_13-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_6-build + - wheel-py3_13-cuda12_9-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -5307,8 +7616,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -5328,7 +7637,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5357,6 +7666,18 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + 
submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5366,22 +7687,13 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_6 + name: wheel-py3_13-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - name: Populate binary env shell: bash run: | @@ -5403,26 +7715,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_6-upload: # Uploading + wheel-py3_13-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_6-test + needs: wheel-py3_13-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_6 + build_name: wheel-py3_13-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_8-build: + wheel-py3_13-xpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -5432,12 +7744,21 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp 
variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -5453,7 +7774,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5482,15 +7803,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5513,7 +7826,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_8 + name: wheel-py3_13-xpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5530,21 +7843,24 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_8-build + - wheel-py3_13-xpu-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" steps: @@ -5563,7 +7879,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5592,6 +7912,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout 
PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5604,8 +7938,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_8 + name: wheel-py3_13-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5617,6 +7952,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -5638,26 +7975,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_8-upload: # Uploading + wheel-py3_13-xpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_8-test + needs: wheel-py3_13-xpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_8 + build_name: wheel-py3_13-xpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-xpu-build: + wheel-py3_13t-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -5667,12 +8003,23 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + DESIRED_PYTHON: "3.13t" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -5688,7 +8035,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5717,6 +8068,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5726,6 +8078,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5748,7 +8102,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-xpu + name: wheel-py3_13t-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5765,10 +8119,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-xpu-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13t-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-xpu-build + - wheel-py3_13t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" timeout-minutes: 300 @@ -5776,11 +8134,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.13" + DESIRED_PYTHON: "3.13t" steps: - name: Display EC2 information shell: bash @@ -5797,7 +8155,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5826,6 
+8188,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -5838,8 +8214,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-xpu + name: wheel-py3_13t-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -5851,6 +8228,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -5872,25 +8251,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-xpu-upload: # Uploading + wheel-py3_13t-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-xpu-test + needs: wheel-py3_13t-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: xpu - GPU_ARCH_TYPE: xpu - DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-xpu + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.13t" + build_name: wheel-py3_13t-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cpu-build: +<<<<<<< HEAD + wheel-py3_13t-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -5900,8 +8280,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: @@ -5980,7 +8361,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cpu + name: wheel-py3_13t-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5997,20 +8378,21 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cpu-test: # Testing + wheel-py3_13t-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cpu-build + - wheel-py3_13t-cuda11_8-build - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" + runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy 
variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: @@ -6070,7 +8452,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cpu + name: wheel-py3_13t-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -6104,25 +8486,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cpu-upload: # Uploading + wheel-py3_13t-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cpu-test + needs: wheel-py3_13t-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cpu + build_name: wheel-py3_13t-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda11_8-build: +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13t-cuda12_6-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -6132,12 +8517,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -6153,7 +8550,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6182,6 +8583,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6191,6 +8593,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6213,7 +8617,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda11_8 + name: wheel-py3_13t-cuda12_6 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6230,10 +8634,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda11_8-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13t-cuda12_6-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda11_8-build + - wheel-py3_13t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -6242,8 +8650,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6263,7 +8671,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6292,6 +8704,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to 
be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6304,8 +8730,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda11_8 + name: wheel-py3_13t-cuda12_6 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6317,6 +8744,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -6338,26 +8767,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda11_8-upload: # Uploading + wheel-py3_13t-cuda12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda11_8-test + needs: wheel-py3_13t-cuda12_6-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: 12.6 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda11_8 + build_name: wheel-py3_13t-cuda12_6 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_6-build: + wheel-py3_13t-cuda12_8-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -6367,12 +8796,24 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: +<<<<<<< HEAD +======= + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -6388,7 +8829,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6417,6 +8862,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6426,6 +8872,8 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6448,7 +8896,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_6 + name: wheel-py3_13t-cuda12_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6465,10 +8913,14 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_6-test: # Testing +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) + wheel-py3_13t-cuda12_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_6-build + - wheel-py3_13t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -6477,8 +8929,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6498,7 +8950,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6527,6 +8983,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to 
be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6539,8 +9009,9 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_6 + name: wheel-py3_13t-cuda12_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6552,6 +9023,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | @@ -6573,26 +9046,30 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_6-upload: # Uploading + wheel-py3_13t-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_6-test + needs: wheel-py3_13t-cuda12_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: 12.6 + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: 12.8 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_6 + build_name: wheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_8-build: +<<<<<<< HEAD + wheel-py3_13t-xpu-build: +======= + wheel-py3_13t-cuda12_9-build: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" @@ -6602,12 +9079,30 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 +<<<<<<< HEAD + DESIRED_CUDA: xpu + GPU_ARCH_TYPE: xpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.13t" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + steps: +======= + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" steps: + # 
NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Display EC2 information shell: bash run: | @@ -6623,7 +9118,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6652,15 +9151,8 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6683,7 +9175,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_8 + name: wheel-py3_13t-cuda12_9 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6700,10 +9192,11 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_8-test: # Testing + + wheel-py3_13t-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_8-build + - wheel-py3_13t-cuda12_9-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" timeout-minutes: 300 @@ -6712,8 +9205,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -6733,7 +9226,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6762,6 +9255,18 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + - name: Checkout PyTorch + uses: 
actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. @@ -6771,22 +9276,13 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +<<<<<<< HEAD +======= - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_8 + name: wheel-py3_13t-cuda12_9 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - name: Populate binary env shell: bash run: | @@ -6808,22 +9304,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_8-upload: # Uploading + wheel-py3_13t-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_8-test + needs: wheel-py3_13t-cuda12_9-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: 12.8 + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: 12.9 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_8 + build_name: wheel-py3_13t-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -6841,8 +9337,17 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | intel-sycl-rt==2025.0.4; platform_system == 'Linux' | intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | intel-sycl-rt==2025.0.5; platform_system == 'Windows' | tcmlib==1.2.0 | umf==0.9.1 | intel-pti==0.10.1 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 steps: + # NOTE: These environment 
variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - name: Display EC2 information shell: bash run: | @@ -6858,7 +9363,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6887,15 +9392,7 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -6935,6 +9432,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) wheel-py3_13t-xpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: @@ -6967,7 +9468,11 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6996,6 +9501,20 @@ jobs: # Let's both exclude the path and disable Windows Defender completely just to be sure # that it doesn't interfere Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore +<<<<<<< HEAD +======= + - name: Checkout PyTorch + uses: actions/checkout@v4 + with: + submodules: recursive + path: pytorch + show-progress: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the # runner.temp variable, which we need. 
@@ -7010,6 +9529,7 @@ jobs: with: name: wheel-py3_13t-xpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" +<<<<<<< HEAD - name: Checkout PyTorch uses: actions/checkout@v4 with: @@ -7021,6 +9541,8 @@ jobs: # Remove any artifacts from the previous checkouts git clean -fxd working-directory: pytorch +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml new file mode 100644 index 000000000000..6283a0e8fbec --- /dev/null +++ b/.github/workflows/h100-distributed.yml @@ -0,0 +1,53 @@ +name: Limited CI for distributed tests on H100 + +on: + pull_request: + paths: + - .github/workflows/h100-distributed.yml + workflow_dispatch: + push: + tags: + - ciflow/h100-distributed/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-dist: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: "linux.12xlarge" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '9.0' + test-matrix: | + { include: [ + { config: "h100_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.8" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-dist + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-dist.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-dist.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml index bcdfcedc2abf..e1cb27e151f2 100644 --- a/.github/workflows/inductor-micro-benchmark-x86.yml +++ b/.github/workflows/inductor-micro-benchmark-x86.yml @@ -22,7 +22,11 @@ jobs: uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.9-gcc11 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Use metal host for benchmark jobs test-matrix: | { include: [ diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index dabb071bbc5e..46675f45b1c0 100644 --- 
a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -18,23 +18,39 @@ permissions: read-all jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build: name: cuda12.6-py3.10-gcc9-sm80 +======= + opt_out_experiments: lf + + build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -42,6 +58,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-test: name: cuda12.6-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -50,5 +67,15 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build.outputs.test-matrix }} +======= + test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index 31ed751bf440..8d910c42f47f 100644 --- a/.github/workflows/inductor-nightly.yml +++ b/.github/workflows/inductor-nightly.yml @@ -4,6 +4,12 @@ on: pull_request: paths: - .github/workflows/inductor-nightly.yml +<<<<<<< HEAD +======= + - benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_huggingface_inference.csv + - benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_timm_inference.csv + - benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) workflow_dispatch: schedule: # Run every day at 7:00 AM UTC @@ -18,13 +24,21 @@ permissions: read-all jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + opt_out_experiments: lf +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build: name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks @@ -32,7 +46,11 @@ jobs: needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 2a12f3440ee5..39199924c29e 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -16,22 +16,38 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm80 +======= + opt_out_experiments: lf + + build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: 
[ @@ -42,6 +58,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-test: name: cuda12.6-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -52,4 +69,18 @@ jobs: test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }} # disable monitor in perf tests for more investigation disable-monitor: true +======= + test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} + # disable monitor in perf tests for more investigation + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index 2ee84e45ecc2..631d6c05081b 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -53,13 +53,21 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + opt_out_experiments: lf +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-aarch64-py3_10-inductor-build: name: linux-jammy-aarch64-py3.10-inductor @@ -69,7 +77,11 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: linux.arm64.m7g.4xlarge build-environment: linux-jammy-aarch64-py3.10 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" }, @@ -96,6 +108,7 @@ jobs: { config: "inductor_timm_perf_cpu_aarch64", shard: 13, num_shards: 15, runner: "linux.arm64.m7g.metal" }, { config: "inductor_timm_perf_cpu_aarch64", shard: 14, num_shards: 15, runner: "linux.arm64.m7g.metal" }, { config: "inductor_timm_perf_cpu_aarch64", shard: 15, num_shards: 15, runner: "linux.arm64.m7g.metal" }, +<<<<<<< HEAD { config: "inductor_torchbench_perf_cpu_aarch64", shard: 1, num_shards: 12, runner: "linux.arm64.m7g.metal" }, { config: "inductor_torchbench_perf_cpu_aarch64", shard: 2, num_shards: 12, runner: "linux.arm64.m7g.metal" }, { config: "inductor_torchbench_perf_cpu_aarch64", shard: 3, num_shards: 12, runner: 
"linux.arm64.m7g.metal" }, @@ -108,6 +121,23 @@ jobs: { config: "inductor_torchbench_perf_cpu_aarch64", shard: 10, num_shards: 12, runner: "linux.arm64.m7g.metal" }, { config: "inductor_torchbench_perf_cpu_aarch64", shard: 11, num_shards: 12, runner: "linux.arm64.m7g.metal" }, { config: "inductor_torchbench_perf_cpu_aarch64", shard: 12, num_shards: 12, runner: "linux.arm64.m7g.metal" }, +======= + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 1, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 2, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 3, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 4, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 5, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 6, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 7, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 8, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 9, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 10, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 11, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 12, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 13, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 14, num_shards: 15, runner: "linux.arm64.m7g.metal" }, + { config: "inductor_torchbench_perf_cpu_aarch64", shard: 15, num_shards: 15, runner: "linux.arm64.m7g.metal" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit @@ -125,7 +155,13 @@ jobs: test-matrix: ${{ needs.linux-jammy-aarch64-py3_10-inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests for more investigation +<<<<<<< HEAD disable-monitor: true +======= + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index 682df7b212b4..13581a2c5e4b 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -2,7 +2,11 @@ name: inductor-perf-nightly-h100 on: schedule: +<<<<<<< HEAD - cron: 0 7 * * 1-6 +======= + - cron: 15 0,4,8,12,16,20 * * 1-6 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - cron: 0 7 * * 0 # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs @@ -68,23 +72,40 @@ permissions: read-all jobs: 
get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD # NB: Keep this in sync with trunk.yml build: name: cuda12.6-py3.10-gcc9-sm90 +======= + opt_out_experiments: lf + + # NB: Keep this in sync with trunk.yml + build: + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '9.0' test-matrix: | { include: [ @@ -93,6 +114,7 @@ jobs: { config: "inductor_huggingface_perf_cuda_h100", shard: 3, num_shards: 5, runner: "linux.aws.h100" }, { config: "inductor_huggingface_perf_cuda_h100", shard: 4, num_shards: 5, runner: "linux.aws.h100" }, { config: "inductor_huggingface_perf_cuda_h100", shard: 5, num_shards: 5, runner: "linux.aws.h100" }, +<<<<<<< HEAD { config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 6, runner: "linux.aws.h100" }, { config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 6, runner: "linux.aws.h100" }, { config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 6, runner: "linux.aws.h100" }, @@ -105,10 +127,29 @@ jobs: { config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 6, runner: "linux.aws.h100" }, { config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 6, runner: "linux.aws.h100" }, { config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 6, runner: "linux.aws.h100" }, +======= + { config: "inductor_timm_perf_cuda_h100", shard: 1, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 2, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 3, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 4, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 5, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 6, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_timm_perf_cuda_h100", shard: 7, num_shards: 7, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 1, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 2, num_shards: 9, runner: "linux.aws.h100" }, + { config: 
"inductor_torchbench_perf_cuda_h100", shard: 3, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 4, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 5, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 6, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 7, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 8, num_shards: 9, runner: "linux.aws.h100" }, + { config: "inductor_torchbench_perf_cuda_h100", shard: 9, num_shards: 9, runner: "linux.aws.h100" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit +<<<<<<< HEAD test-nightly: name: cuda12.6-py3.10-gcc9-sm90 uses: ./.github/workflows/_linux-test.yml @@ -116,40 +157,85 @@ jobs: if: github.event.schedule == '0 7 * * 1-6' with: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90 +======= + test-periodically: + name: cuda12.8-py3.10-gcc9-sm90 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event.schedule == '15 0,4,8,12,16,20 * * 1-6' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 +<<<<<<< HEAD # disable monitor in perf tests for more investigation disable-monitor: true secrets: inherit test-weekly: name: cuda12.6-py3.10-gcc9-sm90 +======= + # disable monitor in perf tests, next step is to enable it + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + test-weekly: + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' with: +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 1440 +<<<<<<< HEAD # disable monitor in perf tests for more investigation disable-monitor: true secrets: inherit test: name: cuda12.6-py3.10-gcc9-sm90 +======= + # disable monitor in perf tests, next step is to enable it + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + test: + name: cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: 
./.github/workflows/_linux-test.yml needs: build if: github.event_name == 'workflow_dispatch' with: +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} docker-image: ${{ needs.build.outputs.docker-image }} test-matrix: ${{ needs.build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests for more investigation +<<<<<<< HEAD disable-monitor: true +======= + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index a63731f759b9..5107eed3b227 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml +++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -1,5 +1,9 @@ +<<<<<<< HEAD name: perf-nightly-macos # Technically not an inductor test, but uses it as a template for tracking macos performance +======= +name: inductor-perf-nightly-macos +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) on: schedule: @@ -21,9 +25,19 @@ on: required: false type: string default: torchbench_perf_mps +<<<<<<< HEAD concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} +======= + pull_request: + paths: + - .github/workflows/inductor-perf-test-nightly-macos.yml + - .ci/pytorch/macos-test.sh + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cancel-in-progress: true permissions: read-all @@ -35,6 +49,7 @@ jobs: uses: ./.github/workflows/_mac-build.yml with: sync-tag: macos-perf-py3-arm64-build +<<<<<<< HEAD build-environment: macos-py3-arm64 runner-type: macos-m1-stable build-generates-artifacts: true @@ -43,6 +58,18 @@ jobs: test-matrix: | { include: [ { config: "perf_smoketest", shard: 1, num_shards: 1, runner: "macos-m1-14" }, +======= + build-environment: macos-py3-arm64-distributed + runner-type: macos-m1-stable + build-generates-artifacts: true + # To match the one pre-installed in the m1 runners + python-version: 3.12.7 + test-matrix: | + { include: [ + { config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, + { config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, + { config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) ]} secrets: inherit @@ -51,10 +78,21 @@ jobs: uses: ./.github/workflows/_mac-test.yml needs: macos-perf-py3-arm64-build with: +<<<<<<< HEAD build-environment: macos-py3-arm64 # Same as the build job python-version: 3.9.12 test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }} # disable monitor in perf tests for more investigation disable-monitor: true +======= + build-environment: macos-py3-arm64-distributed + # Same as the build job + python-version: 3.12.7 + test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }} + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml index 30489f34254a..3212c0f66c31 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -70,13 +70,18 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-benchmark-build: if: github.repository_owner == 'pytorch' @@ -85,6 +90,17 @@ jobs: with: build-environment: linux-focal-rocm6_3-py3_10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + opt_out_experiments: lf + + linux-jammy-rocm-py3_10-inductor-benchmark-build: + if: github.repository_owner == 'pytorch' + name: rocm-py3_10-inductor-benchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" }, @@ -102,6 +118,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-benchmark-test: permissions: id-token: write @@ -117,4 +134,23 @@ jobs: timeout-minutes: 720 # Disable monitor in perf tests for more investigation disable-monitor: true +======= + linux-jammy-rocm-py3_10-inductor-benchmark-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-inductor-benchmark-test + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-benchmark-build + with: + build-environment: linux-jammy-rocm-py3_10 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }} + test-matrix: 
${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }} + timeout-minutes: 720 + # Disable monitor in perf tests for more investigation + disable-monitor: true + monitor-log-interval: 10 + monitor-data-collect-interval: 2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml new file mode 100644 index 000000000000..68418859a9b1 --- /dev/null +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -0,0 +1,129 @@ +name: inductor-perf-nightly-x86-zen + +on: + push: + tags: + - ciflow/inductor-perf-test-nightly-x86-zen/* + schedule: + # - cron: 0 7 * * 1-6 + # - cron: 0 7 * * 0 + # Does not perform max_autotune on CPU, so skip the weekly run setup + - cron: 0 7 * * * + # NB: GitHub has an upper limit of 10 inputs here + workflow_dispatch: + inputs: + training: + # CPU for training is not typical, but leave the option open here + description: Run training (off by default)? + required: false + type: boolean + default: false + inference: + description: Run inference (on by default)? + required: false + type: boolean + default: true + default: + description: Run inductor_default? + required: false + type: boolean + default: true + dynamic: + description: Run inductor_dynamic_shapes? + required: false + type: boolean + default: false + cppwrapper: + description: Run inductor_cpp_wrapper? + required: false + type: boolean + default: false + aotinductor: + description: Run aot_inductor for inference? + required: false + type: boolean + default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + default: inductor_huggingface_perf_zen_cpu_x86,inductor_timm_perf_zen_cpu_x86,inductor_torchbench_perf_zen_cpu_x86 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + linux-jammy-zen-cpu-py3_9-gcc11-inductor-build: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 2, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 3, num_shards: 3, runner: "linux.24xlarge.amd" }, + { config: 
"inductor_timm_perf_zen_cpu_x86", shard: 1, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 2, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 3, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 4, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_timm_perf_zen_cpu_x86", shard: 5, num_shards: 5, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 1, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 2, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 3, num_shards: 4, runner: "linux.24xlarge.amd" }, + { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 4, num_shards: 4, runner: "linux.24xlarge.amd" }, + ]} + selected-test-configs: ${{ inputs.benchmark_configs }} + secrets: inherit + + linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + if: github.event.schedule == '0 7 * * *' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true + docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + timeout-minutes: 720 + # disable monitor in perf tests + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + + linux-jammy-zen-cpu-py3_9-gcc11-inductor-test: + name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-py3.9-gcc11-build + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} + docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + timeout-minutes: 720 + # disable monitor in perf tests + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 7db8089fd5f6..e0eb134fd117 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -55,13 +55,21 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ 
github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + opt_out_experiments: lf +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linux-jammy-cpu-py3_9-gcc11-inductor-build: name: linux-jammy-cpu-py3.9-gcc11-inductor @@ -70,7 +78,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" }, @@ -101,8 +113,15 @@ jobs: docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} timeout-minutes: 720 +<<<<<<< HEAD # disable monitor in perf tests for more investigation disable-monitor: true +======= + # disable monitor in perf tests + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit @@ -117,6 +136,13 @@ jobs: docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} timeout-minutes: 720 +<<<<<<< HEAD # disable monitor in perf tests for more investigation disable-monitor: true +======= + # disable monitor in perf tests + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 5541bfe22ac6..f905d381e83e 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -68,23 +68,40 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD # NB: Keep this in sync with trunk.yml linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm80 +======= + opt_out_experiments: lf + + # NB: Keep this in sync with trunk.yml + build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -111,6 +128,7 @@ jobs: selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-test-nightly: name: cuda12.6-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -154,4 +172,53 @@ jobs: timeout-minutes: 720 # disable monitor in perf tests for more investigation disable-monitor: true +======= + test-nightly: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event.schedule == '0 7 * * 1-6' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} + timeout-minutes: 720 + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + test-weekly: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event.schedule == '0 7 * * 0' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} + timeout-minutes: 1440 + # disable monitor in perf tests, next step is to enable it + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 + secrets: inherit + + test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + if: github.event_name == 'workflow_dispatch' + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} + timeout-minutes: 720 + disable-monitor: false + monitor-log-interval: 15 + monitor-data-collect-interval: 4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index ada7139a81a2..708d30f9f18a 100644 --- 
a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -20,22 +20,38 @@ permissions: read-all jobs: get-default-label-prefix: name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-build: name: cuda12.6-py3.10-gcc9-sm86-periodic-dynamo-benchmarks +======= + opt_out_experiments: lf + + linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build: + name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.6' test-matrix: | { include: [ @@ -57,6 +73,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-test: name: cuda12.6-py3.10-gcc9-sm86-periodic-dynamo-benchmarks uses: ./.github/workflows/_linux-test.yml @@ -74,6 +91,25 @@ jobs: with: build-environment: linux-focal-rocm6_3-py3_10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test: + name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build: + if: github.repository_owner == 'pytorch' + name: rocm-py3_10-periodic-dynamo-benchmarks + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -95,6 +131,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-periodic-dynamo-benchmarks-test: permissions: id-token: write @@ -110,13 +147,35 @@ jobs: 
linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp: name: cuda12.6-py3.10-gcc9-sm80 +======= + linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-periodic-dynamo-benchmarks + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -124,6 +183,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-test-gcp: name: cuda12.6-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -134,6 +194,16 @@ jobs: test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }} # disable monitor in smoke perf tests for more investigation disable-monitor: true +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: @@ -142,7 +212,11 @@ jobs: needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -171,6 +245,7 @@ jobs: secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm86 uses: ./.github/workflows/_linux-build.yml @@ -181,6 +256,18 @@ jobs: cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" sync-tag: linux-focal-cuda12_6-py3_10-gcc9-inductor-build +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: 
cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, @@ -196,6 +283,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-test: name: cuda12.6-py3.10-gcc9-sm86 uses: ./.github/workflows/_linux-test.yml @@ -204,6 +292,16 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }} +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cpu-py3_9-gcc11-inductor-build: @@ -212,7 +310,11 @@ jobs: needs: get-default-label-prefix with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build test-matrix: | diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index da19dde06b78..3fbaf2da74c9 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -28,22 +28,38 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-build: name: rocm6.3-py3.10-inductor +======= + opt_out_experiments: lf + + linux-jammy-rocm-py3_10-inductor-build: + name: rocm-py3.10-inductor-mi300 +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" }, @@ -51,6 +67,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-test: permissions: id-token: write @@ -62,4 +79,17 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.test-matrix }} +======= + linux-jammy-rocm-py3_10-inductor-test: + permissions: + id-token: write + contents: read + name: rocm-py3.10-inductor-mi300 + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-build + with: + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm.yml index b224f3c68827..eaa7a2794691 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm.yml @@ -21,22 +21,38 @@ permissions: jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-build: name: rocm6.3-py3.10-inductor +======= + opt_out_experiments: lf + + linux-jammy-rocm-py3_10-inductor-build: + name: rocm-py3.10-inductor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" 
}, @@ -44,6 +60,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-inductor-test: permissions: id-token: write @@ -55,4 +72,17 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.test-matrix }} +======= + linux-jammy-rocm-py3_10-inductor-test: + permissions: + id-token: write + contents: read + name: rocm-py3.10-inductor + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-build + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index ffc32540931b..cc414fb8af0f 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -1,6 +1,10 @@ # Workflow: Inductor Unit Test # 1. runs unit tests for inductor. +<<<<<<< HEAD # 2. perfoms daily memory leak checks and reruns of disabled tests, scheduled at `29 8 * * *`. +======= +# 2. performs daily memory leak checks and reruns of disabled tests, scheduled at `29 8 * * *`. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) name: inductor-unittest on: @@ -17,13 +21,18 @@ permissions: read-all jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm86 @@ -32,10 +41,22 @@ jobs: with: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks +======= + opt_out_experiments: lf + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ +<<<<<<< HEAD { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "inductor_distributed", shard: 1, num_shards: 1, runner: 
"linux.g5.12xlarge.nvidia.gpu" }, @@ -61,10 +82,38 @@ jobs: with: build-environment: linux-focal-cuda12.6-py3.12-gcc9-sm86 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks +======= + { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" }, + { config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_12-gcc9-inductor-build: + name: cuda12.8-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ +<<<<<<< HEAD { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} @@ -78,6 +127,21 @@ jobs: build-environment: linux-focal-cuda12.6-py3.12-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_6-py3_12-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_12-gcc9-inductor-build.outputs.test-matrix }} +======= + { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_12-gcc9-inductor-test: + name: cuda12.8-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cpu-py3_12-inductor-halide-build: @@ -86,11 +150,19 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-py3.12-gcc11 +<<<<<<< HEAD docker-image-name: 
pytorch-linux-jammy-py3.12-halide runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ { config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.12-halide + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor-halide", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -110,11 +182,19 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-py3.12-gcc11 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.12-triton-cpu runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ { config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.12-triton-cpu + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -134,6 +214,7 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | @@ -142,6 +223,16 @@ jobs: { config: "inductor_amx", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + test-matrix: | + { include: [ + { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, + { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -155,6 +246,7 @@ jobs: test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_13-gcc9-inductor-build: name: cuda12.6-py3.13-gcc9-sm86 uses: ./.github/workflows/_linux-build.yml @@ -178,4 +270,29 @@ jobs: build-environment: linux-focal-cuda12.6-py3.13-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_6-py3_13-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_13-gcc9-inductor-build.outputs.test-matrix }} +======= + linux-jammy-cuda12_8-py3_13-gcc9-inductor-build: + name: cuda12.8-py3.13-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + 
needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + test-matrix: | + { include: [ + { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_13-gcc9-inductor-test: + name: cuda12.8-py3.13-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 0cccdd96a67f..5e3c466b5333 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -33,13 +33,18 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc9-inductor-build: name: cuda12.6-py3.10-gcc9-sm86 @@ -69,6 +74,38 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }} +======= + opt_out_experiments: lf + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + test-matrix: | + { include: [ + { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + { config: 
"inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-cpu-py3_9-gcc11-inductor-build: @@ -77,11 +114,16 @@ jobs: needs: get-label-type with: build-environment: linux-jammy-py3.9-gcc11-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build test-matrix: | { include: [ +<<<<<<< HEAD { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" }, @@ -90,6 +132,16 @@ jobs: { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" }, +======= + { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, + { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/lint-autoformat.yml b/.github/workflows/lint-autoformat.yml index bf68a0877b90..89b9ce15f10c 100644 --- a/.github/workflows/lint-autoformat.yml +++ b/.github/workflows/lint-autoformat.yml @@ -1,10 +1,15 
@@ name: Apply lint suggestions on: +<<<<<<< HEAD push: tags: - ciflow/autoformat/* +======= + pull_request: + types: [opened, synchronize, reopened, labeled, unlabeled] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) jobs: lintrunner-autoformat: @@ -12,6 +17,7 @@ jobs: contents: read pull-requests: write runs-on: lf.linux.2xlarge +<<<<<<< HEAD if: ${{ github.repository_owner == 'pytorch' && github.event.pull_request.user.login != 'ezyang' && github.event.pull_request.user.login != 'malfet' && !startsWith(github.head_ref, 'export-') }} steps: - name: Checkout pytorch @@ -28,6 +34,22 @@ jobs: # we can't run all files here because only changes around where the diff are shown in the PR UI run: | export ADDITIONAL_LINTRUNNER_ARGS="format" +======= + if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'autoformat') }} + steps: + - name: Checkout pytorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + with: + submodules: true + fetch-depth: 0 + - name: Run lintrunner (nonretryable) + continue-on-error: true + run: | + set -ex + python3 -m venv /tmp/venv + source /tmp/venv/bin/activate + export ADDITIONAL_LINTRUNNER_ARGS="format --all-files" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bash .github/scripts/lintrunner.sh - name: Check for changes id: git-check @@ -37,7 +59,11 @@ jobs: - name: Suggest changes if: steps.git-check.outputs.changes == 'true' continue-on-error: true +<<<<<<< HEAD uses: parkerbxyz/suggest-changes@v1 +======= + uses: parkerbxyz/suggest-changes@a2ec1653b0c4cc8287d682f0066dba4a173cc7f3 # v1.0.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: comment: "Please commit the suggested changes from pytorch's linter." 
diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index 64ed12e9c5b8..b0d55d6a3a11 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -20,7 +20,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Run BC Lint Action +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/bc-lint@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo: ${{ github.event.pull_request.head.repo.full_name }} base_sha: ${{ github.event.pull_request.base.sha }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 82d2532fa995..efc18675fce7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,6 +10,11 @@ on: - main - release/* - landchecks/* +<<<<<<< HEAD +======= + tags: + - ciflow/pull/* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: permissions: read-all @@ -19,19 +24,31 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} lintrunner-clang: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter +======= + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 @@ -43,12 +60,20 @@ jobs: .github/scripts/lintrunner.sh lintrunner-noclang: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter +======= + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 
@@ -59,11 +84,16 @@ jobs: .github/scripts/lintrunner.sh quick-checks: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-focal-linter fetch-depth: 0 ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -72,6 +102,12 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" +======= + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Ensure no non-breaking spaces # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2 # does not support the '\u000a' syntax (which is relevant for local linters) @@ -103,7 +139,11 @@ jobs: if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: -1 @@ -116,22 +156,35 @@ jobs: bash .github/scripts/pr-sanity-check.sh workflow-checks: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-focal-linter +======= + docker-image: ci-image:pytorch-linux-jammy-linter +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) fetch-depth: -1 submodules: true ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | +<<<<<<< HEAD # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" # Regenerate workflows export RELEASE_VERSION_TAG=2.7 +======= + # Regenerate workflows + export RELEASE_VERSION_TAG=2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) .github/scripts/generate_ci_workflows.py RC=0 @@ -155,11 +208,16 @@ jobs: exit $RC toc: +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-focal-linter fetch-depth: 0 ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -168,6 +226,12 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" +======= + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Regenerate ToCs and check that they didn't change set -eu @@ -195,11 +259,16 @@ jobs: test-tools: name: Test tools if: ${{ github.repository == 'pytorch/pytorch' }} +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: get-label-type with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" +<<<<<<< HEAD docker-image: pytorch-linux-focal-linter fetch-depth: 0 ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -208,6 +277,12 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" +======= + docker-image: ci-image:pytorch-linux-jammy-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Test tools PYTHONPATH=$(pwd) pytest tools/stats PYTHONPATH=$(pwd) pytest tools/test -o "python_files=test*.py" @@ -219,12 +294,20 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 - name: Setup Python 3.9 +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' architecture: x64 @@ -242,15 +325,32 @@ jobs: test_collect_env: if: ${{ github.repository == 'pytorch/pytorch' }} name: Test collect_env +<<<<<<< HEAD runs-on: linux.24_04.4x strategy: matrix: test_type: [with_torch, without_torch, older_python_version] +======= + runs-on: ${{ matrix.runner }} + strategy: + matrix: + include: + - test_type: with_torch + runner: linux.24_04.4x + - test_type: without_torch + runner: linux.24_04.4x + - test_type: older_python_version + runner: linux.24_04.4x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) steps: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 @@ -263,7 +363,11 @@ jobs: echo "MIN_PYTHON_VERSION=$(python3 .github/scripts/get_ci_variable.py --min-python-version)" >> "${GITHUB_OUTPUT}" - name: Setup Old Python version if: matrix.test_type == 'older_python_version' +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: 3.8 architecture: x64 @@ -273,7 +377,11 @@ jobs: **/requirements.txt - name: Setup Min Python version if: matrix.test_type != 'older_python_version' +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: ${{ steps.get-min-python-version.outputs.MIN_PYTHON_VERSION }} architecture: x64 @@ -292,6 +400,18 @@ jobs: # All we need to see is that it passes python3 torch/utils/collect_env.py +<<<<<<< HEAD +======= + link-check: + name: Link checks + needs: get-label-type + uses: ./.github/workflows/_link_check.yml + with: + runner: ${{ needs.get-label-type.outputs.label-type }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + secrets: inherit + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 31dcc855de4b..c422607a0d08 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -19,7 +19,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -33,6 +37,7 @@ jobs: with: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-aarch64-py3.10 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 runner: linux.arm64.2xlarge test-matrix: | @@ -44,6 +49,18 @@ jobs: { config: "default", shard: 1, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" }, { config: "default", shard: 2, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" }, { config: "default", shard: 3, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" }, +======= + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 + runner: linux.arm64.m7g.4xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" }, + { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.arm64.m7g.4xlarge" }, + { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" }, + { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m8g.4xlarge" }, + { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m8g.4xlarge" }, + { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m8g.4xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit diff --git a/.github/workflows/llm_td_retrieval.yml b/.github/workflows/llm_td_retrieval.yml index 3b7baeb04f44..6fe0e391b885 100644 --- a/.github/workflows/llm_td_retrieval.yml +++ b/.github/workflows/llm_td_retrieval.yml @@ -12,7 +12,11 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +31,11 @@ jobs: needs: get-label-type steps: - name: Clone PyTorch +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: pytorch/pytorch fetch-depth: 0 @@ -37,31 +45,48 @@ jobs: uses: ./pytorch/.github/actions/setup-linux - name: Clone CodeLlama +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: osalpekar/codellama ref: main path: codellama - name: Clone Target Determination Code +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: osalpekar/llm-target-determinator ref: v0.0.2 path: llm-target-determinator +<<<<<<< HEAD - name: Setup miniconda uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 with: python-version: "3.9" +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Install requirements shell: bash run: | set -euxo pipefail +<<<<<<< HEAD ${CONDA_RUN} pip install -r llm-target-determinator/requirements.txt cd "${GITHUB_WORKSPACE}/codellama" ${CONDA_RUN} pip install -e . +======= + python3 -m pip install -r llm-target-determinator/requirements.txt + cd "${GITHUB_WORKSPACE}/codellama" + python3 -m pip install -e . 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Fetch CodeLlama Checkpoint shell: bash @@ -72,7 +97,11 @@ jobs: aws s3 cp "s3://target-determinator-assets/CodeLlama-7b-Python" "CodeLlama-7b-Python" --recursive --no-progress - name: Fetch indexes +<<<<<<< HEAD uses: nick-fields/retry@v3.0.0 +======= + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: max_attempts: 3 retry_wait_seconds: 10 @@ -80,7 +109,11 @@ jobs: shell: bash command: | set -euxo pipefail +<<<<<<< HEAD ${CONDA_RUN} python -m pip install awscli==1.29.40 +======= + python3 -m pip install awscli==1.29.40 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets aws s3 cp "s3://target-determinator-assets/indexes/latest" . --recursive @@ -94,7 +127,12 @@ jobs: run: | set -euxo pipefail cd "${GITHUB_WORKSPACE}"/llm-target-determinator +<<<<<<< HEAD ${CONDA_RUN} torchrun \ +======= + export PATH="$HOME/.local/bin:$PATH" + torchrun \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) --standalone \ --nnodes=1 \ --nproc-per-node=1 \ @@ -105,7 +143,11 @@ jobs: zip -r mappings.zip mappings - name: Upload results to s3 +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ steps.run_retriever.outcome == 'success' }} with: name: llm_results @@ -120,5 +162,9 @@ jobs: AWS_REGION: "" - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml index c655b66d31c1..077506af36a8 100644 --- a/.github/workflows/mac-mps.yml +++ b/.github/workflows/mac-mps.yml @@ -23,7 +23,11 @@ jobs: runner-type: macos-m1-stable build-generates-artifacts: true # To match the one pre-installed in the m1 runners +<<<<<<< HEAD python-version: 3.9.12 +======= + python-version: 3.12.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # The runner macos-m2-14 is not a typo, it's a custom runner that is different # than our AWS macos-m1-14 runners test-matrix: | @@ -36,12 +40,22 @@ jobs: macos-py3-arm64-mps-test: name: macos-py3-arm64-mps +<<<<<<< HEAD uses: ./.github/workflows/_mac-test-mps.yml +======= + uses: ./.github/workflows/_mac-test.yml +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) needs: macos-py3-arm64-build with: sync-tag: macos-py3-arm64-mps-test build-environment: macos-py3-arm64 # Same as the build job +<<<<<<< HEAD python-version: 3.9.12 test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }} +======= + python-version: 3.12.7 + test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix 
}} + disable-monitor: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/nightly-s3-uploads.yml b/.github/workflows/nightly-s3-uploads.yml index fc52df29b521..419a21b8de09 100644 --- a/.github/workflows/nightly-s3-uploads.yml +++ b/.github/workflows/nightly-s3-uploads.yml @@ -23,12 +23,20 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false +<<<<<<< HEAD - uses: actions/setup-python@v4 +======= + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip @@ -37,13 +45,21 @@ jobs: pip3 install requests==2.32.2 boto3==1.35.42 - name: Authenticate with AWS +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_upload_external_contrib_stats aws-region: us-east-1 - name: Upload external contribution stats +<<<<<<< HEAD uses: nick-fields/retry@v3.0.0 +======= + uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 3014ac777a5c..82d8542ce9ce 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -5,8 +5,12 @@ on: - cron: 0 0 * * * push: tags: +<<<<<<< HEAD # Final Release tags look like: v1.11.0 - v[0-9]+.[0-9]+.[0-9]+ +======= + # NOTE: Doc build pipelines should only get triggered on release candidate builds +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - ciflow/nightly/* @@ -20,7 +24,11 @@ concurrency: jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -28,6 +36,18 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD +======= + link-check: + name: Link checks + needs: get-label-type + uses: ./.github/workflows/_link_check.yml + with: + runner: ${{ needs.get-label-type.outputs.label-type }} + ref: ${{ github.sha }} + secrets: inherit + +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docs-build: name: docs build uses: ./.github/workflows/_linux-build.yml @@ -35,7 +55,11 @@ jobs: with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" build-environment: linux-jammy-py3.9-gcc11 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit docs-push: @@ -48,7 +72,11 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11 docker-image: ${{ needs.docs-build.outputs.docker-image }} +<<<<<<< HEAD push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || (startsWith(github.event.ref, 'refs/tags/v') && !contains(github.event.ref, 'rc')) }} +======= + push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) run-doxygen: true secrets: GH_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} @@ -79,7 +107,11 @@ jobs: if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') steps: - name: "${{ matrix.repo-owner }}/${{ matrix.repo-name }} update-commit-hash" +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo-owner: ${{ matrix.repo-owner }} repo-name: ${{ matrix.repo-name }} diff --git a/.github/workflows/nitpicker.yml b/.github/workflows/nitpicker.yml index 4c769a2b9e02..5cf28563573f 100644 --- a/.github/workflows/nitpicker.yml +++ b/.github/workflows/nitpicker.yml @@ -19,7 +19,11 @@ jobs: if: ${{ github.event.pull_request.number != 26921 && github.repository_owner == 'pytorch' }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - uses: ethanis/nitpicker@v1 with: nitpicks: '.github/nitpicks.yml' diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml new file mode 100644 index 000000000000..810602b9c57b --- /dev/null +++ b/.github/workflows/operator_benchmark.yml @@ -0,0 +1,59 @@ +name: operator_benchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + inputs: + test_mode: + required: false + type: string + default: 'short' + description: tag filter for operator benchmarks, options from long, short, all + schedule: + # Run at 07:00 UTC every Sunday + - cron: 0 7 * * 0 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + linux-jammy-cpu-py3_9-gcc11-opbenchmark-build: + if: github.repository_owner == 'pytorch' + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: 
./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build: + if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + test-matrix: | + { include: [ + { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + ]} + secrets: inherit + + linux-jammy-cpu-py3_9-gcc11-opbenchmark-test: + name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build + with: + build-environment: linux-jammy-py3.9-gcc11-build + docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml new file mode 100644 index 000000000000..e0983577cec5 --- /dev/null +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -0,0 +1,81 @@ +name: periodic-rocm-mi300 + +on: + schedule: + # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs. + # Also run less frequently on weekends. 
+ - cron: 45 0,8,16 * * 1-5 + - cron: 45 4 * * 0,6 + - cron: 45 4,12,20 * * 1-5 + - cron: 45 12 * * 0,6 + - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests + push: + tags: + - ciflow/periodic-rocm-mi300/* + branches: + - release/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} + cancel-in-progress: true + +permissions: read-all + +jobs: + llm-td: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + get-label-type: + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10-mi300 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + test-matrix: | + { include: [ + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10-mi300 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 76953638d64c..9f525e1096f9 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -41,7 +41,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} @@ -49,14 +53,24 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type 
}} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-build: name: linux-focal-cuda12.6-py3.10-gcc11 +======= + linux-jammy-cuda12_8-py3_10-gcc11-build: + name: linux-jammy-cuda12.8-py3.10-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -68,6 +82,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-test: name: linux-focal-cuda12.6-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml @@ -82,12 +97,33 @@ jobs: linux-focal-cuda11_8-py3_9-gcc9-build: name: linux-focal-cuda11.8-py3.9-gcc9 +======= + linux-jammy-cuda12_8-py3_10-gcc11-test: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_9-gcc9-build: + name: linux-jammy-cuda12.8-py3.9-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda11.8-py3.9-gcc9 docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9 +======= + build-environment: linux-jammy-cuda12.8-py3.9-gcc9 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: 8.6 test-matrix: | { include: [ @@ -97,6 +133,7 @@ jobs: build-with-debug: false secrets: inherit +<<<<<<< HEAD linux-focal-cuda11_8-py3_9-gcc9-test: name: linux-focal-cuda11.8-py3.9-gcc9 uses: ./.github/workflows/_linux-test.yml @@ -109,12 +146,31 @@ jobs: linux-focal-cuda11_8-py3_10-gcc9-debug-build: name: linux-focal-cuda11.8-py3.10-gcc9-debug +======= + linux-jammy-cuda12_8-py3_9-gcc9-test: + name: linux-jammy-cuda12.8-py3.9-gcc9 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_9-gcc9-build + with: + build-environment: linux-jammy-cuda12.8-py3.9-gcc9 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-debug-build: + name: linux-jammy-cuda12.8-py3.10-gcc9-debug +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: 
./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda11.8-py3.10-gcc9-debug docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-with-debug: true test-matrix: | { include: [ @@ -128,6 +184,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda11_8-py3_10-gcc9-debug-test: name: linux-focal-cuda11.8-py3.10-gcc9-debug uses: ./.github/workflows/_linux-test.yml @@ -142,12 +199,33 @@ jobs: linux-focal-rocm6_3-py3_10-build: name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-cuda12_8-py3_10-gcc9-debug-test: + name: linux-jammy-cuda12.8-py3.10-gcc9-debug + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc9-debug-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, @@ -156,6 +234,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: permissions: id-token: write @@ -173,10 +252,30 @@ jobs: linux-focal-cuda12_6-py3-gcc11-slow-gradcheck-build: name: linux-focal-cuda12.6-py3-gcc11-slow-gradcheck +======= + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build: + name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3-gcc11-slow-gradcheck docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 cuda-arch-list: 8.6 @@ -220,3 +319,33 @@ jobs: { config: "default", shard: 1, num_shards: 1, runner: 
"${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, ]} secrets: inherit +======= + build-environment: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: 8.6 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + { config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-test: + name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck + docker-image: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build.outputs.test-matrix }} + timeout-minutes: 300 + secrets: inherit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 2881210f3f3d..169e6fca4d5e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -9,6 +9,11 @@ on: - main - release/* - landchecks/* +<<<<<<< HEAD +======= + tags: + - ciflow/pull/* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: schedule: - cron: 29 8 * * * # about 1:29am PDT @@ -38,7 +43,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -52,7 +61,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11 +<<<<<<< HEAD 
docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -79,7 +92,10 @@ jobs: build-environment: linux-jammy-py3.9-gcc11 docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }} +<<<<<<< HEAD timeout-minutes: 300 +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-docs: @@ -98,7 +114,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-no-ops +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -112,21 +132,35 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-pch +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang15-asan-build: name: linux-jammy-py3.10-clang15-asan +======= + linux-jammy-py3_10-clang18-asan-build: + name: linux-jammy-py3.10-clang18-asan +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan +======= + build-environment: linux-jammy-py3.10-clang18-asan + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -140,6 +174,7 @@ jobs: secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang15-asan-test: name: linux-jammy-py3.10-clang15-asan uses: ./.github/workflows/_linux-test.yml @@ -155,12 +190,34 @@ jobs: linux-focal-py3_9-clang10-onnx-build: name: linux-focal-py3.9-clang10-onnx +======= + linux-jammy-py3_10-clang18-asan-test: + name: linux-jammy-py3.10-clang18-asan + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_10-clang18-asan-build + - target-determination + with: + build-environment: linux-jammy-py3.10-clang18-asan + docker-image: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.test-matrix }} + sync-tag: asan-test + secrets: inherit + + 
linux-jammy-py3_9-clang12-onnx-build: + name: linux-jammy-py3.9-clang12-onnx +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-py3.9-clang10-onnx docker-image-name: pytorch-linux-focal-py3-clang10-onnx +======= + build-environment: linux-jammy-py3.9-clang12-onnx + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -168,6 +225,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-py3_9-clang10-onnx-test: name: linux-focal-py3.9-clang10-onnx uses: ./.github/workflows/_linux-test.yml @@ -217,12 +275,33 @@ jobs: linux-focal-py3_13-clang10-build: name: linux-focal-py3.13-clang10 +======= + linux-jammy-py3_9-clang12-onnx-test: + name: linux-jammy-py3.9-clang12-onnx + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-onnx-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12-onnx + docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_9-clang12-build: + name: linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-py3.13-clang10 docker-image-name: pytorch-linux-focal-py3.13-clang10 +======= + build-environment: linux-jammy-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -235,6 +314,7 @@ jobs: { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, +<<<<<<< HEAD ]} secrets: inherit @@ -251,12 +331,72 @@ jobs: linux-focal-cuda11_8-py3_10-gcc9-build: name: linux-focal-cuda11.8-py3.10-gcc9 +======= + { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" } + ]} + secrets: inherit + + linux-jammy-py3_9-clang12-test: + name: linux-jammy-py3.9-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12 + docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + secrets: inherit + + 
linux-jammy-py3_13-clang12-build: + name: linux-jammy-py3.13-clang12 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3.13-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.13-clang12 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" } + ]} + secrets: inherit + + linux-jammy-py3_13-clang12-test: + name: linux-jammy-py3.13-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3_13-clang12-build + with: + build-environment: linux-jammy-py3.13-clang12 + docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }} + timeout-minutes: 600 + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-build-distributed: + name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda11.8-py3.10-gcc9 docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '7.5' test-matrix: | { include: [ @@ -266,6 +406,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda11_8-py3_10-gcc9-test: name: linux-focal-cuda11.8-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml @@ -281,12 +422,34 @@ jobs: linux-focal-cuda12_6-py3_10-gcc11-build: name: linux-focal-cuda12.6-py3.10-gcc11 +======= + linux-jammy-cuda12_8-py3_10-gcc11-test-distributed: + name: linux-jammy-cuda12.8-py3.10-gcc11-test + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed + - target-determination + with: + timeout-minutes: 360 
+ build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-build: + name: linux-jammy-cuda12.8-py3.10-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, @@ -297,6 +460,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-test: name: linux-focal-cuda12.6-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml @@ -308,6 +472,19 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc11 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-build.outputs.test-matrix }} +======= + linux-jammy-cuda12_8-py3_10-gcc11-test: + name: linux-jammy-cuda12.8-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - target-determination + with: + timeout-minutes: 360 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-py3-clang12-mobile-build: @@ -317,7 +494,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3-clang12-mobile-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3-clang15-asan +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang15-asan +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-generates-artifacts: false test-matrix: | { include: [ @@ -325,27 +506,46 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-jammy-cuda-11_8-cudnn9-py3_9-clang12-build: name: linux-jammy-cuda11.8-cudnn9-py3.9-clang12 +======= + linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: + name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-cuda11.8-cudnn9-py3.9-clang12 docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12 +======= + build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + docker-image-name: 
ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit +<<<<<<< HEAD linux-focal-py3_9-clang9-xla-build: name: linux-focal-py3_9-clang9-xla +======= + linux-jammy-py3_9-clang9-xla-build: + name: linux-jammy-py3_9-clang9-xla +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-py3.9-clang9-xla +======= + build-environment: linux-jammy-py3.9-clang9-xla +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite test-matrix: | { include: [ @@ -353,6 +553,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-py3_9-clang9-xla-test: name: linux-focal-py3_9-clang9-xla uses: ./.github/workflows/_linux-test.yml @@ -384,12 +585,31 @@ jobs: linux-focal-cpu-py3_10-gcc11-bazel-test: name: linux-focal-cpu-py3.10-gcc11-bazel-test +======= + linux-jammy-py3_9-clang9-xla-test: + name: linux-jammy-py3_9-clang9-xla + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3_9-clang9-xla-build + with: + build-environment: linux-jammy-py3.9-clang9-xla + docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-cpu-py3_10-gcc11-bazel-test: + name: linux-jammy-cpu-py3.10-gcc11-bazel-test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_bazel-build-test.yml needs: get-label-type with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.large" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11-bazel-test docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-bazel-test + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-version: cpu test-matrix: | { include: [ @@ -404,7 +624,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-generates-artifacts: false test-matrix: | { include: [ @@ -412,16 +636,28 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-build: # don't run build twice on main if: github.event_name == 'pull_request' name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-rocm-py3_10-build: + # don't run build twice on main + if: github.event_name == 'pull_request' + name: linux-jammy-rocm-py3.10 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -431,14 +667,24 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-sm89-build: name: linux-focal-cuda12.6-py3.10-gcc11-sm89 +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm89-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11-sm89 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: 8.9 test-matrix: | { include: [ @@ -450,6 +696,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD unstable-linux-focal-cuda12_6-py3_10-gcc11-sm89-build-xfail: # A version of the build that sets a larger number of jobs for a build. May # OOM @@ -480,6 +727,18 @@ jobs: build-environment: linux-focal-cuda12.6-py3.10-gcc11-sm89 docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-sm89-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-sm89-build.outputs.test-matrix }} +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm89-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm89-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit linux-jammy-py3-clang12-executorch-build: @@ -489,7 +748,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3-clang12-executorch +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3-clang12-executorch +======= + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -500,20 +763,34 @@ jobs: name: linux-jammy-py3-clang12-executorch uses: ./.github/workflows/_linux-test.yml needs: linux-jammy-py3-clang12-executorch-build +<<<<<<< HEAD +======= + if: false # Has been broken for a while +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: build-environment: linux-jammy-py3-clang12-executorch docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_4-py3_10-gcc9-inductor-build: name: cuda12.4-py3.10-gcc9-sm75 +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm75 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm75 docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '7.5' test-matrix: | { include: [ @@ -521,6 +798,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_4-py3_10-gcc9-inductor-test: name: cuda12.4-py3.10-gcc9-sm75 uses: ./.github/workflows/_linux-test.yml @@ -540,6 +818,27 @@ jobs: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-2025.0-py3.9 docker-image-name: pytorch-linux-jammy-xpu-2025.0-py3 +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm75 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-xpu-2025_1-py3_9-build: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-1-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml index 5b3cb1265c8b..708b7e234f56 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -13,14 +13,22 @@ jobs: GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} steps: - name: Checkout repo +<<<<<<< HEAD uses: actions/checkout@v2 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: checkout with: fetch-depth: 0 token: ${{ secrets.MERGEBOT_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' architecture: x64 diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index e83e776223a6..1b462fe10fe3 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -28,7 +28,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -36,15 +40,26 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-rocm-py3_10-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-rocm-py3.10-mi300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -57,6 +72,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: permissions: id-token: write @@ -70,4 +86,19 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }} +======= + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10-mi300 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10-mi300 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 6ff8667a9d94..af105e7e07ca 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -26,6 +26,7 @@ jobs: id-token: write contents: read +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} name: linux-focal-rocm6.3-py3.10 @@ -33,6 +34,15 @@ jobs: with: build-environment: 
linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + linux-jammy-rocm-py3_10-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -45,6 +55,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: permissions: id-token: write @@ -58,4 +69,19 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }} +======= + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/runner-determinator-validator.yml b/.github/workflows/runner-determinator-validator.yml index 72581829f7a0..df2de9fbc861 100644 --- a/.github/workflows/runner-determinator-validator.yml +++ b/.github/workflows/runner-determinator-validator.yml @@ -20,7 +20,11 @@ jobs: steps: - name: Checkout repository +<<<<<<< HEAD uses: actions/checkout@v2 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Run Hardcode runner-determinator script id: hardcode-script @@ -38,4 +42,8 @@ jobs: # version embedded into .github/workflows/_runner-determinator.yml diff runner_determinator_workflow.py .github/scripts/runner_determinator.py # Fail the job if the scripts are not identical - continue-on-error: false \ No newline at end of file +<<<<<<< HEAD + continue-on-error: false +======= + continue-on-error: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/runner_determinator_script_sync.yaml b/.github/workflows/runner_determinator_script_sync.yaml index a47c3b418860..102793c8162a 100644 --- a/.github/workflows/runner_determinator_script_sync.yaml +++ b/.github/workflows/runner_determinator_script_sync.yaml @@ -15,7 +15,11 @@ jobs: runs-on: ubuntu-latest steps: +<<<<<<< HEAD - uses: actions/checkout@v4 +======= + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: sparse-checkout: | .github diff --git a/.github/workflows/s390.yml b/.github/workflows/s390.yml index f95af2bd1561..eeacc99ee0b2 100644 --- a/.github/workflows/s390.yml +++ b/.github/workflows/s390.yml @@ -2,8 +2,11 @@ name: s390 on: push: +<<<<<<< HEAD branches: - 
main +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tags: - ciflow/s390/* workflow_dispatch: @@ -21,6 +24,10 @@ jobs: uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-s390x-binary-manywheel +<<<<<<< HEAD docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main +======= + docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner: linux.s390x secrets: inherit diff --git a/.github/workflows/s390x-periodic.yml b/.github/workflows/s390x-periodic.yml index 67f68fcaee9a..07c027bfb0e1 100644 --- a/.github/workflows/s390x-periodic.yml +++ b/.github/workflows/s390x-periodic.yml @@ -9,8 +9,11 @@ on: tags: - ciflow/periodic/* - ciflow/s390/* +<<<<<<< HEAD branches: - release/* +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workflow_dispatch: concurrency: @@ -42,7 +45,11 @@ jobs: uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-s390x-binary-manywheel +<<<<<<< HEAD docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main +======= + docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner: linux.s390x test-matrix: | { include: [ @@ -70,8 +77,14 @@ jobs: - target-determination with: build-environment: linux-s390x-binary-manywheel +<<<<<<< HEAD docker-image: pytorch/manylinuxs390x-builder:cpu-s390x-main test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }} timeout-minutes: 480 +======= + docker-image: pytorch/manylinuxs390x-builder:cpu-s390x + test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }} + timeout-minutes: 600 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) use-gha: "yes" secrets: inherit diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 9567e15d2f5d..96581b082787 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -25,12 +25,20 @@ jobs: steps: - name: "Checkout code" +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: persist-credentials: false - name: "Run analysis" +<<<<<<< HEAD uses: ossf/scorecard-action@865b4092859256271290c77adbd10a43f4779972 # tag=v2.0.3 +======= + uses: ossf/scorecard-action@865b4092859256271290c77adbd10a43f4779972 # v2.0.3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: results_file: results.sarif results_format: sarif @@ -42,7 +50,11 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. 
- name: "Upload artifact" +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: name: SARIF file path: results.sarif @@ -50,6 +62,10 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" +<<<<<<< HEAD uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # tag=v1.0.26 +======= + uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: sarif_file: results.sarif diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index b0c73f0a3969..496f9186b1e4 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -39,7 +39,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -47,14 +51,24 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-sm86-build: name: linux-focal-cuda12.6-py3.10-gcc11-sm86 +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm86-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm86 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11-sm86 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: 8.6 test-matrix: | { include: [ @@ -64,6 +78,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-sm86-test: name: linux-focal-cuda12.6-py3.10-gcc11-sm86 uses: ./.github/workflows/_linux-test.yml @@ -78,12 +93,33 @@ jobs: linux-focal-py3_9-clang10-build: name: linux-focal-py3.9-clang10 +======= + linux-jammy-cuda12_8-py3_10-gcc11-sm86-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm86-build + - target-determination + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_9-clang12-build: + name: linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-py3.9-clang10 docker-image-name: pytorch-linux-focal-py3.9-clang10 +======= + build-environment: linux-jammy-py3.9-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, @@ -91,6 +127,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-py3_9-clang10-test: name: linux-focal-py3.9-clang10 uses: ./.github/workflows/_linux-test.yml @@ -105,12 +142,33 @@ jobs: linux-focal-rocm6_3-py3_10-build: name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-py3_9-clang12-test: + name: linux-jammy-py3.9-clang12 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_9-clang12-build + - target-determination + with: + build-environment: linux-jammy-py3.9-clang12 + docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-rocm-py3_10-build: + name: linux-jammy-rocm-py3.10 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] }, @@ -118,6 +176,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: permissions: id-token: write @@ -135,10 +194,30 @@ jobs: linux-jammy-py3_10-clang15-asan-build: name: linux-jammy-py3.10-clang15-asan +======= + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_10-clang18-asan-build: + name: linux-jammy-py3.10-clang18-asan +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-jammy-py3.10-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan test-matrix: | @@ -146,10 +225,20 @@ jobs: { config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" }, { config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" }, { config: "slow", shard: 3, num_shards: 3, runner: "linux.4xlarge" }, +======= + build-environment: 
linux-jammy-py3.10-clang18-asan + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} sync-tag: asan-build secrets: inherit +<<<<<<< HEAD linux-jammy-py3_10-clang15-asan-test: name: linux-jammy-py3.10-clang15-asan uses: ./.github/workflows/_linux-test.yml @@ -160,5 +249,17 @@ jobs: build-environment: linux-jammy-py3.10-clang15-asan docker-image: ${{ needs.linux-jammy-py3_10-clang15-asan-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-py3_10-clang15-asan-build.outputs.test-matrix }} +======= + linux-jammy-py3_10-clang18-asan-test: + name: linux-jammy-py3.10-clang18-asan + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-py3_10-clang18-asan-build + - target-determination + with: + build-environment: linux-jammy-py3.10-clang18-asan + docker-image: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: asan-test secrets: inherit diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 047e4a47ab97..08822d4943f4 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -27,7 +27,11 @@ jobs: pull-requests: write steps: +<<<<<<< HEAD - uses: actions/github-script@v6 +======= + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: script: | // Do some dumb retries on requests. 
diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml index 363b59b78054..af0d59fead54 100644 --- a/.github/workflows/target-determination-indexer.yml +++ b/.github/workflows/target-determination-indexer.yml @@ -13,7 +13,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -26,7 +30,11 @@ jobs: environment: target-determinator-env steps: - name: Clone PyTorch +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: path: pytorch @@ -35,9 +43,15 @@ jobs: - name: Calculate docker image id: calculate-docker-image +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7 with: docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9 +======= + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.8 + with: + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) working-directory: pytorch - name: Use following to pull public copy of the image @@ -46,34 +60,57 @@ jobs: ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash run: | +<<<<<<< HEAD tag=${ECR_DOCKER_IMAGE##*/} echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7 +======= + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7 - name: Clone CodeLlama uses: actions/checkout@v3 +======= + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.8 + + - name: Clone CodeLlama + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: osalpekar/codellama ref: 1ec50e0cfc0fadc3b6ceb146617e2119ab26eb34 path: codellama - name: Clone Target Determination Code +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repository: osalpekar/llm-target-determinator ref: v0.0.2 path: llm-target-determinator - name: 
Configure AWS credentials +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_target_determinator_s3_read_write aws-region: us-east-1 @@ -100,6 +137,11 @@ jobs: AWS_DEFAULT_REGION: us-east-1 run: | # detached container should get cleaned up by teardown_ec2_linux +<<<<<<< HEAD +======= + # Disable shellcheck warning for GPU_FLAG + # shellcheck disable=SC2086 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) container_name=$(docker run \ ${GPU_FLAG:-} \ -e MAX_JOBS="$(nproc --ignore=2)" \ @@ -147,7 +189,11 @@ jobs: "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" - name: Teardown Linux +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: always() concurrency: diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml index 7ed28deb94f2..fcb2586a1e9a 100644 --- a/.github/workflows/target_determination.yml +++ b/.github/workflows/target_determination.yml @@ -9,7 +9,11 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +31,11 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. 
- name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false @@ -49,7 +57,11 @@ jobs: job_identifier: ${{ github.workflow }} - name: Download LLM Artifacts from S3 +<<<<<<< HEAD uses: seemethere/download-artifact-s3@v4 +======= + uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: name: llm_results @@ -76,7 +88,11 @@ jobs: python3 tools/testing/do_target_determination_for_s3.py - name: Upload TD results to s3 +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: steps.td.outcome == 'success' with: name: td_results @@ -85,7 +101,11 @@ jobs: path: td_results.json - name: Store TD results on GHA +<<<<<<< HEAD uses: actions/upload-artifact@v4 +======= + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: steps.td.outcome == 'success' with: name: td_results.json diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index c6898d36353e..c30166dfd5c9 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -15,26 +15,47 @@ jobs: check_binary_linux_cpu: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CPU +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: docker-image: python:3.11 docker-build-dir: "skip-docker-build" script: | pushd .ci/pytorch/ +<<<<<<< HEAD pip install --pre torch --index-url https://download.pytorch.org/whl/test/cpu DESIRED_PYTHON=3.11 DESIRED_CUDA=cpu DESIRED_DEVTOOLSET=cxx11-abi PACKAGE_TYPE=manywheel ./check_binary.sh +======= + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu + DESIRED_PYTHON=3.11 DESIRED_CUDA=cpu PACKAGE_TYPE=manywheel ./check_binary.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd check_binary_linux_cuda: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CUDA +<<<<<<< HEAD uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.7 +======= + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: runner: linux.4xlarge.nvidia.gpu docker-image: python:3.11 docker-build-dir: "skip-docker-build" script: | +<<<<<<< HEAD pushd .ci/pytorch/ pip install --pre torch --index-url https://download.pytorch.org/whl/test/cu126 DESIRED_PYTHON=3.11 DESIRED_CUDA=cu126 
DESIRED_DEVTOOLSET=cxx11-abi PACKAGE_TYPE=manywheel ./check_binary.sh +======= + STABLE_CUDA_VERSION=$(python3 .github/scripts/get_ci_variable.py --cuda-stable-version) + CUDA_VERSION_NODOT=$(echo ${STABLE_CUDA_VERSION} | tr -d '.') + pushd .ci/pytorch/ + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_NODOT} + DESIRED_PYTHON=3.11 DESIRED_CUDA=cu${CUDA_VERSION_NODOT} PACKAGE_TYPE=manywheel ./check_binary.sh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) popd diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml new file mode 100644 index 000000000000..9d92e969686f --- /dev/null +++ b/.github/workflows/test-h100.yml @@ -0,0 +1,55 @@ +name: Limited CI on H100 + +on: + pull_request: + paths: + - .github/workflows/test-h100.yml + workflow_dispatch: + schedule: + - cron: 0 4,10,16,22 * * * # every 6 hours + push: + tags: + - ciflow/h100/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: "linux.12xlarge" + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '9.0' + test-matrix: | + { include: [ + { config: "smoke", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index 4717c309c788..a3b617e2c5bc 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -14,22 +14,36 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD 
linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp: name: cuda12.4-py3.10-gcc9-sm80 +======= + build: + name: cuda12.8-py3.10-gcc9-sm80 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix with: runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' test-matrix: | { include: [ @@ -37,6 +51,7 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-cuda12_4-py3_10-gcc9-torchbench-test-gcp: name: cuda12.4-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-test.yml @@ -45,4 +60,14 @@ jobs: build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80 docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp.outputs.test-matrix }} +======= + test: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.build.outputs.docker-image }} + test-matrix: ${{ needs.build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit diff --git a/.github/workflows/trunk-tagging.yml b/.github/workflows/trunk-tagging.yml new file mode 100644 index 000000000000..b460195c37e6 --- /dev/null +++ b/.github/workflows/trunk-tagging.yml @@ -0,0 +1,224 @@ +name: trunk-tagging + +on: + push: + branches: + - main + workflow_dispatch: + inputs: + commit_sha: + description: 'Commit SHA to tag (leave empty for current HEAD)' + required: false + type: string + +concurrency: + group: trunk-tagging-${{ github.event.inputs.commit_sha || github.sha }} + cancel-in-progress: false + +permissions: + contents: write + +jobs: + tag-trunk-commit: + name: Tag trunk commit + runs-on: ubuntu-latest + if: github.repository_owner == 'pytorch' + + steps: + - name: Pre-checkout validation + run: | + # For workflow_dispatch, validate SHA format before checkout + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + COMMIT_SHA="${{ github.event.inputs.commit_sha }}" + + # Verify it's a well-formed SHA (40 hex characters) + if ! echo "${COMMIT_SHA}" | grep -qE '^[a-f0-9]{40}$'; then + echo "Error: Invalid commit SHA format. 
Expected 40 hexadecimal characters, got: ${COMMIT_SHA}" + exit 1 + fi + + echo "✅ Pre-checkout validation passed for: ${COMMIT_SHA}" + else + echo "✅ Using current commit SHA - no pre-checkout validation needed" + fi + + - name: Checkout repository + uses: actions/checkout@v4 + with: + # Fetch full history to ensure we have all commits + fetch-depth: 0 + # For workflow_dispatch, checkout the specified commit + ref: ${{ github.event.inputs.commit_sha || github.sha }} + + - name: Set commit SHA + id: commit + run: | + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + COMMIT_SHA="${{ github.event.inputs.commit_sha }}" + else + COMMIT_SHA="${{ github.sha }}" + fi + echo "sha=${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" + echo "tag_name=trunk/${COMMIT_SHA}" >> "${GITHUB_OUTPUT}" + + - name: Validate commit SHA + run: | + COMMIT_SHA="${{ steps.commit.outputs.sha }}" + + # Verify the commit exists and is valid + if ! git cat-file -e "${COMMIT_SHA}"; then + echo "Error: Commit SHA ${COMMIT_SHA} does not exist in repository" + exit 1 + fi + + # For workflow_dispatch, verify the commit exists on main branch + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + echo "Manual dispatch detected - validating commit is on main branch..." + + # Get all commits reachable from main branch + if ! git merge-base --is-ancestor "${COMMIT_SHA}" origin/main; then + echo "Error: Commit ${COMMIT_SHA} is not reachable from main branch" + echo "Only commits that exist on the main branch can be tagged" + exit 1 + fi + + echo "✅ Commit ${COMMIT_SHA} is valid and exists on main branch" + else + echo "✅ Commit ${COMMIT_SHA} is valid (automatic push trigger)" + fi + + - name: Create and push tag with retry + id: check_tag + env: + TAG_NAME: ${{ steps.commit.outputs.tag_name }} + COMMIT_SHA: ${{ steps.commit.outputs.sha }} + run: | + set -e + + # Check if tag already exists + check_tag_exists() { + # Check if tag exists locally + if git tag -l "${TAG_NAME}" | grep -q "${TAG_NAME}"; then + echo "Tag ${TAG_NAME} already exists locally" + return 0 + fi + + # Check if tag exists on remote + if git ls-remote --tags origin "${TAG_NAME}" | grep -q "${TAG_NAME}"; then + echo "Tag ${TAG_NAME} already exists on remote" + return 0 + fi + + return 1 + } + + # Exit early if tag already exists + if check_tag_exists; then + echo "✅ Tag already exists - no action needed" + echo "exists=true" >> "${GITHUB_OUTPUT}" + exit 0 + fi + + echo "Tag ${TAG_NAME} does not exist, proceeding with creation" + + # Retry configuration + MAX_RETRIES=5 + BASE_DELAY=2 + BACKOFF_MULTIPLIER=4 + MAX_DELAY=3600 + + # Common retry function with exponential backoff + retry_with_backoff() { + local command="${1}" + local description="${2}" + local retry_count=0 + + while [ "${retry_count}" -le "${MAX_RETRIES}" ]; do + echo "Attempt $((retry_count + 1))/$((MAX_RETRIES + 1)): ${description}" + + if eval "${command}"; then + echo "Success on attempt $((retry_count + 1))" + return 0 + fi + + retry_count=$((retry_count + 1)) + + if [ "${retry_count}" -le "${MAX_RETRIES}" ]; then + # Calculate delay with exponential backoff + local delay=$((BASE_DELAY * (BACKOFF_MULTIPLIER ** retry_count))) + if [ "${delay}" -gt "${MAX_DELAY}" ]; then + delay="${MAX_DELAY}" + fi + + echo "Failed. Retrying in ${delay} seconds..." + sleep "${delay}" + fi + done + + echo "All retry attempts exhausted" + return 1 + } + + # Function to create and push tag + create_and_push_tag() { + # Create the tag + if ! 
git tag "${TAG_NAME}" "${COMMIT_SHA}"; then + echo "Failed to create local tag" + return 1 + fi + + # Push the tag + if git push origin "${TAG_NAME}"; then + echo "Successfully created and pushed tag ${TAG_NAME}" + return 0 + else + echo "Failed to push tag to remote" + # Clean up local tag for retry + git tag -d "${TAG_NAME}" 2>/dev/null || true + return 1 + fi + } + + # Function to handle retries with race condition checks + tag_with_retry() { + # Check if tag exists before attempting creation + if check_tag_exists; then + echo "Tag ${TAG_NAME} was created by another process, exiting successfully" + return 0 + fi + + create_and_push_tag || { + # Fetch latest state for next retry + git fetch origin --tags + return 1 + } + } + + # Execute with retry + if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then + echo "exists=false" >> "${GITHUB_OUTPUT}" + exit 0 + else + echo "Tag creation failed after all retry attempts" + exit 1 + fi + + - name: Tag creation summary + if: always() + run: | + if [ "${{ steps.check_tag.outputs.exists }}" = "true" ]; then + echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed" + elif [ "${{ job.status }}" = "success" ]; then + echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" + else + echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" + fi + + echo "" + echo "Tag details:" + echo " Name: ${{ steps.commit.outputs.tag_name }}" + echo " Commit: ${{ steps.commit.outputs.sha }}" + echo " Trigger: ${{ github.event_name }}" + if [ -n "${{ github.event.inputs.commit_sha }}" ]; then + echo " Manual commit: ${{ github.event.inputs.commit_sha }}" + fi diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 6d0fa57ef212..571842e24cdd 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -37,7 +37,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -45,6 +49,7 @@ jobs: curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} +<<<<<<< HEAD libtorch-linux-focal-cuda12_6-py3_10-gcc11-debug-build: name: libtorch-linux-focal-cuda12.6-py3.10-gcc11-debug uses: ./.github/workflows/_linux-build.yml @@ -52,6 +57,15 @@ jobs: with: build-environment: libtorch-linux-focal-cuda12.6-py3.10-gcc11 docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + libtorch-linux-jammy-cuda12_8-py3_10-gcc11-debug-build: + name: libtorch-linux-jammy-cuda12.8-py3.10-gcc11-debug + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: libtorch-linux-jammy-cuda12.8-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build-generates-artifacts: false runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: "linux.4xlarge" @@ 
-62,14 +76,24 @@ jobs: secrets: inherit # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated +<<<<<<< HEAD linux-focal-cuda12_6-py3_10-gcc11-no-ops-build: name: linux-focal-cuda12.6-py3.10-gcc11-no-ops +======= + linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-cuda12.6-py3.10-gcc11-no-ops docker-image-name: pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11 +======= + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-no-ops + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -86,12 +110,17 @@ jobs: runner-type: macos-m1-stable build-generates-artifacts: true # To match the one pre-installed in the m1 runners +<<<<<<< HEAD python-version: 3.9.12 +======= + python-version: 3.12.7 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" }, { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" }, { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" }, +<<<<<<< HEAD ]} secrets: inherit @@ -109,6 +138,11 @@ jobs: { include: [ { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, +======= + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ]} secrets: inherit @@ -121,8 +155,14 @@ jobs: with: build-environment: macos-py3-arm64 # Same as the build job +<<<<<<< HEAD python-version: 3.9.12 test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }} +======= + python-version: 3.12.7 + test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }} + disable-monitor: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit win-vs2022-cpu-py3-build: @@ -132,7 +172,10 @@ jobs: with: build-environment: win-vs2022-cpu-py3 cuda-version: cpu +<<<<<<< HEAD sync-tag: win-cpu-build +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" test-matrix: | { include: [ @@ -152,6 +195,10 @@ jobs: build-environment: win-vs2022-cpu-py3 cuda-version: cpu test-matrix: ${{ needs.win-vs2022-cpu-py3-build.outputs.test-matrix }} +<<<<<<< HEAD +======= + disable-monitor: false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit win-vs2022-cuda12_6-py3-build: @@ -164,15 +211,26 @@ jobs: runner: "${{ 
needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-build: if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }} name: linux-focal-rocm6.3-py3.10 +======= + linux-jammy-rocm-py3_10-build: + if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }} + name: linux-jammy-rocm-py3.10 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" +<<<<<<< HEAD build-environment: linux-focal-rocm6.3-py3.10 docker-image-name: pytorch-linux-focal-rocm-n-py3 +======= + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sync-tag: rocm-build test-matrix: | { include: [ @@ -182,11 +240,16 @@ jobs: ]} secrets: inherit +<<<<<<< HEAD linux-focal-rocm6_3-py3_10-test: +======= + linux-jammy-rocm-py3_10-test: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }} permissions: id-token: write contents: read +<<<<<<< HEAD name: linux-focal-rocm6.3-py3.10 uses: ./.github/workflows/_rocm-test.yml needs: @@ -196,10 +259,22 @@ jobs: build-environment: linux-focal-rocm6.3-py3.10 docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }} +======= + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" secrets: inherit # NB: Keep this in sync with inductor-perf-test-nightly.yml +<<<<<<< HEAD linux-focal-cuda12_4-py3_10-gcc9-inductor-build: name: cuda12.4-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-build.yml @@ -207,6 +282,15 @@ jobs: with: build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80 docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks +======= + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cuda-arch-list: '8.0' secrets: inherit @@ -217,7 +301,11 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.9-gcc11 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-py3.9-gcc11 +======= + docker-image-name: 
ci-image:pytorch-linux-jammy-py3.9-gcc11 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-matrix: | { include: [ { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index dc93cd24e19a..a373f93545f5 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -16,13 +16,21 @@ jobs: steps: - name: Checkout repo id: checkout +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 token: ${{ secrets.MERGEBOT_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' check-latest: false @@ -87,7 +95,11 @@ jobs: python3 .github/scripts/comment_on_pr.py "${PR_NUM}" "merge" - name: configure aws credentials +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status @@ -96,13 +108,21 @@ jobs: - name: Upload merge record to s3 if: always() continue-on-error: true +<<<<<<< HEAD uses: seemethere/upload-artifact-s3@v5 +======= + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: s3-bucket: ossci-raw-job-status s3-prefix: merges/${{ github.repository }}/${{ github.event.client_payload.pr_num }}/${{ github.event.client_payload.comment_id }}/${{ github.run_id }} path: merge_record.json +<<<<<<< HEAD # We want newer merge commands to supercede old ones +======= +# We want newer merge commands to supersede old ones +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) concurrency: group: try-merge-${{ github.event.client_payload.pr_num }} cancel-in-progress: true diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml index f6039c59245d..a54b03aa80ef 100644 --- a/.github/workflows/tryrebase.yml +++ b/.github/workflows/tryrebase.yml @@ -13,13 +13,21 @@ jobs: steps: - name: Checkout repo id: checkout +<<<<<<< HEAD uses: actions/checkout@v2 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 token: ${{ secrets.MERGEBOT_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v4 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: 
python-version: '3.9' architecture: x64 diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 13e189234cfe..b4765f067bda 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -44,7 +44,11 @@ jobs: get-label-type: name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index a326f4db5b45..960ce696d502 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -18,7 +18,11 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - name: Update viable/strict +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: update_viablestrict with: repository: pytorch/pytorch @@ -30,7 +34,11 @@ jobs: clickhouse-password: ${{ secrets.CLICKHOUSE_VIABLESTRICT_PASSWORD }} - name: Authenticate to AWS with OIDC +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status aws-region: us-east-1 diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index 68b41c626035..c1fc945bf940 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,13 +17,21 @@ jobs: contents: read steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false - name: configure aws credentials id: aws_creds +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_update_pytorch_labels aws-region: us-east-1 diff --git a/.github/workflows/upload-test-stats-while-running.yml b/.github/workflows/upload-test-stats-while-running.yml index 938edd11b9ec..15d907a6f9a8 100644 --- a/.github/workflows/upload-test-stats-while-running.yml +++ b/.github/workflows/upload-test-stats-while-running.yml @@ -16,7 +16,11 @@ jobs: runs-on: linux.2xlarge steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false @@ -24,6 +28,7 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux +<<<<<<< HEAD - name: Setup miniconda uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7 with: @@ -32,9 +37,18 @@ jobs: - name: Install requirements run: | ${CONDA_RUN} pip install requests==2.32.2 boto3==1.35.42 +======= + - name: Install requirements + run: | + python3 -m pip install requests==2.32.2 boto3==1.35.42 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - name: Upload test stats env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | +<<<<<<< HEAD ${CONDA_RUN} python -m tools.stats.upload_test_stats_running_jobs +======= + python3 -m tools.stats.upload_test_stats_running_jobs +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index c7c2acbb9c46..2f146b66b3df 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -2,7 +2,29 @@ name: Upload test stats on: workflow_run: +<<<<<<< HEAD workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, rocm-mi300, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm, inductor-rocm-mi300, mac-mps] +======= + workflows: + - pull + - trunk + - periodic + - periodic-rocm-mi300 + - inductor + - unstable + - slow + - unstable-periodic + - inductor-periodic + - rocm + - rocm-mi300 + - inductor-micro-benchmark + - inductor-micro-benchmark-x86 + - inductor-cu124 + - inductor-rocm + - inductor-rocm-mi300 + - mac-mps + - linux-aarch64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) types: - completed @@ -17,7 +39,11 @@ jobs: steps: - name: Get workflow run conclusion # TODO (huydhn): Pin this once https://github.com/octokit/request-action/issues/315 is resolved +<<<<<<< HEAD uses: octokit/request-action@release/2.7 +======= + uses: octokit/request-action@05a2312de9f8207044c4c9e41fe19703986acc13 # v2.x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: get_conclusion with: route: GET /repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/attempts/${{ github.event.workflow_run.run_attempt }} @@ -39,16 +65,27 @@ jobs: run: echo "${TRIGGERING_WORKFLOW}" - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 - name: Configure aws credentials uses: aws-actions/configure-aws-credentials@v3 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 + + - name: Configure aws credentials + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-torch-test-stats aws-region: us-east-1 +<<<<<<< 
HEAD - uses: actions/setup-python@v4 +======= + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml index d9979b2dcaf0..f2056850dee2 100644 --- a/.github/workflows/upload-torch-dynamo-perf-stats.yml +++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml @@ -2,7 +2,11 @@ name: Upload torch dynamo performance stats on: workflow_run: +<<<<<<< HEAD workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100] +======= + workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, inductor-perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) types: - completed @@ -14,7 +18,11 @@ jobs: steps: - name: Get workflow run conclusion # TODO (huydhn): Pin this once https://github.com/octokit/request-action/issues/315 is resolved +<<<<<<< HEAD uses: octokit/request-action@release/2.7 +======= + uses: octokit/request-action@05a2312de9f8207044c4c9e41fe19703986acc13 # v2.x +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) id: get-conclusion with: route: GET /repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/attempts/${{ github.event.workflow_run.run_attempt }} @@ -32,19 +40,31 @@ jobs: name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }} steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: submodules: false fetch-depth: 1 - name: Configure aws credentials +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v3 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue-on-error: true with: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-torch-test-stats aws-region: us-east-1 +<<<<<<< HEAD - uses: actions/setup-python@v4 +======= + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml index e8958ea8b651..a3dad9a8da48 100644 --- a/.github/workflows/upload_test_stats_intermediate.yml +++ b/.github/workflows/upload_test_stats_intermediate.yml @@ -17,12 +17,20 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch +<<<<<<< HEAD uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7 +======= + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 1 submodules: false +<<<<<<< HEAD - uses: actions/setup-python@v4 +======= + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.11' cache: pip @@ -31,7 +39,11 @@ jobs: pip3 install requests==2.32.2 boto3==1.35.42 - name: Authenticate with AWS +<<<<<<< HEAD uses: aws-actions/configure-aws-credentials@v4 +======= + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: role-to-assume: arn:aws:iam::308535385114:role/gha_upload_test_stats_intermediate_workflow aws-region: us-east-1 diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 84b2f2f2a122..e1175cd6103b 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -17,12 +17,20 @@ jobs: environment: update-commit-hash steps: - name: Checkout repo +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true +<<<<<<< HEAD uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.7 +======= + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: repo-name: xla branch: master @@ -37,12 +45,20 @@ jobs: environment: update-commit-hash steps: - name: Checkout repo +<<<<<<< HEAD uses: actions/checkout@v3 +======= + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: fetch-depth: 0 token: ${{ secrets.UPDATEBOT_TOKEN }} - name: Setup Python +<<<<<<< HEAD uses: actions/setup-python@v2 +======= + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: python-version: '3.9' - name: Install requirements diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index c5a420f3b243..b118a21358f1 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -15,7 +15,11 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type +<<<<<<< HEAD uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7 +======= + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.8 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -30,6 +34,7 @@ jobs: sync-tag: 
linux-xpu-2025-0-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-2025.0-py3.9 +<<<<<<< HEAD docker-image-name: pytorch-linux-jammy-xpu-2025.0-py3 runner: linux.12xlarge test-matrix: | @@ -45,13 +50,59 @@ jobs: name: linux-jammy-xpu-2025.0-py3.9 uses: ./.github/workflows/_xpu-test.yml needs: linux-jammy-xpu-2025_0-py3_9-build +======= + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3 + runner: linux.12xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + ]} + secrets: inherit + + linux-jammy-xpu-2025_1-py3_9-build: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + sync-tag: linux-xpu-2025-1-build + runner_prefix: ${{ needs.get-label-type.outputs.label-type }} + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + runner: linux.12xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + ]} + secrets: inherit + + linux-jammy-xpu-2025_1-py3_9-test: + name: linux-jammy-xpu-2025.1-py3.9 + uses: ./.github/workflows/_xpu-test.yml + needs: linux-jammy-xpu-2025_1-py3_9-build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) permissions: id-token: write contents: read with: +<<<<<<< HEAD build-environment: linux-jammy-xpu-2025.0-py3.9 docker-image: ${{ needs.linux-jammy-xpu-2025_0-py3_9-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-xpu-2025_0-py3_9-build.outputs.test-matrix }} +======= + build-environment: linux-jammy-xpu-2025.1-py3.9 + docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) secrets: inherit windows-xpu-2025_0-build: @@ -65,3 +116,18 @@ jobs: xpu-version: '2025.0' vc-year: '2022' secrets: inherit +<<<<<<< HEAD +======= + + windows-xpu-2025_1-build: + if: github.repository_owner == 'pytorch' + name: win-vs2022-xpu-2025_1-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2022-xpu-py3 + cuda-version: cpu + use-xpu: true + xpu-version: '2025.1' + vc-year: '2022' + secrets: inherit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.gitignore b/.gitignore index 7557c564a6de..cdd9c8cfb4d1 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,10 @@ docs/source/generated/ 
docs/source/compile/generated/ log usage_log.txt +<<<<<<< HEAD +======= +usage_log* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test-reports/ test/*.bak test/**/*.bak @@ -62,9 +66,13 @@ test/forward_backward_compatibility/nightly_schemas.txt dropout_model.pt test/generated_type_hints_smoketest.py test/htmlcov +<<<<<<< HEAD test/cpp_extensions/install/ test/cpp_extensions/open_registration_extension/install test/cpp_extensions/libtorch_agnostic_extension/install +======= +test/cpp_extensions/**/install +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) test/kernel.errors.txt third_party/build/ third_party/nccl/ @@ -180,6 +188,10 @@ compile_commands.json *.egg-info/ docs/source/scripts/activation_images/ docs/source/scripts/quantization_backend_configs/ +<<<<<<< HEAD +======= +docs/source/scripts/lr_scheduler_images/ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## General @@ -213,6 +225,7 @@ docs/source/scripts/quantization_backend_configs/ # Compiled MATLAB *.mex* +<<<<<<< HEAD # IPython notebook checkpoints .ipynb_checkpoints @@ -222,6 +235,8 @@ docs/source/scripts/quantization_backend_configs/ *.swp *~ +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # NFS handle files **/.nfs* @@ -391,3 +406,9 @@ android/pytorch_android_torchvision/.cxx .arcconfig .stable_pyre_client .pyre_client +<<<<<<< HEAD +======= + +# Claude Code local configuration +CLAUDE.local.md +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.gitmodules b/.gitmodules index 3408fb8a87c5..2aec6224669a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,10 +2,13 @@ ignore = dirty path = third_party/pybind11 url = https://github.com/pybind/pybind11.git +<<<<<<< HEAD [submodule "third_party/eigen"] ignore = dirty path = third_party/eigen url = https://gitlab.com/libeigen/eigen.git +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [submodule "third_party/googletest"] ignore = dirty path = third_party/googletest @@ -25,7 +28,11 @@ [submodule "third_party/gloo"] ignore = dirty path = third_party/gloo +<<<<<<< HEAD url = https://github.com/facebookincubator/gloo +======= + url = https://github.com/pytorch/gloo +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [submodule "third_party/NNPACK_deps/pthreadpool"] ignore = dirty path = third_party/pthreadpool diff --git a/.lintrunner.toml b/.lintrunner.toml index 17163c016b24..c219b76cb535 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -18,6 +18,11 @@ exclude_patterns = [ 'torch/_inductor/autoheuristic/artifacts/**', 'scripts/**', 'test/generated_type_hints_smoketest.py', +<<<<<<< HEAD +======= + # CPython tests + 'test/dynamo/cpython/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Tests from the NumPy test suite 'test/torch_np/numpy_test/**/*.py', 'third_party/**', @@ -55,12 +60,20 @@ init_command = [ code = 'CLANGFORMAT' include_patterns = [ 'aten/src/ATen/*.h', +<<<<<<< HEAD +======= + 'aten/src/ATen/cpu/vec/**/*.h', +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'aten/src/ATen/mps/**/*.mm', 'aten/src/ATen/mps/**/*.h', 'aten/src/ATen/xpu/**/*.h', 'aten/src/ATen/xpu/**/*.cpp', 'aten/src/ATen/core/boxing/**/*.h', 'aten/src/ATen/core/dispatch/**/*.h', +<<<<<<< HEAD +======= + 'aten/src/ATen/core/Formatting.cpp', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'aten/src/ATen/native/mps/**/*.metal', 'aten/src/ATen/native/mps/**/*.mm', 'aten/src/ATen/native/mps/**/*.h', @@ -81,6 +94,12 @@ include_patterns = [ 'torch/csrc/**/*.h', 'torch/csrc/**/*.hpp', 'torch/csrc/**/*.cpp', +<<<<<<< HEAD +======= + 'torch/nativert/**/*.h', + 'torch/nativert/**/*.cpp', + 'torch/headeronly/**/*.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/cpp/**/*.h', 'test/cpp/**/*.cpp', ] @@ -147,6 +166,7 @@ init_command = [ 'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"', 'numpy==2.1.0 ; python_version >= "3.12"', 'expecttest==0.3.0', +<<<<<<< HEAD 'mypy==1.14.0', 'sympy==1.13.3', 'types-requests==2.27.25', @@ -155,13 +175,27 @@ init_command = [ 'types-protobuf==3.19.18', 'types-pkg-resources==0.1.3', 'types-Jinja2==2.11.9', +======= + 'mypy==1.16.0', + 'sympy==1.13.3', + 'types-requests==2.27.25', + 'types-pyyaml==6.0.1', + 'types-tabulate==0.8.8', + 'types-protobuf==5.29.1.20250403', + 'types-setuptools==79.0.0.20250422', + 'types-jinja2==2.11.9', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'types-colorama==0.4.6', 'filelock==3.13.1', 'junitparser==2.1.1', 'rich==10.9.0', 'pyyaml==6.0.1', 'optree==0.13.0', +<<<<<<< HEAD 'dataclasses_json==0.6.7', +======= + 'dataclasses-json==0.6.7', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'pandas==2.2.3', ] @@ -223,12 +257,24 @@ include_patterns = [ 'c10/**/*.cpp', 'c10/**/*.h', 'torch/*.h', +<<<<<<< HEAD +======= + 'torch/_inductor/codegen/aoti_runtime/interface.cpp', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/csrc/*.h', 'torch/csrc/*.cpp', 'torch/csrc/**/*.h', 'torch/csrc/**/*.cpp', 'torch/csrc/jit/serialization/*.h', 'torch/csrc/jit/serialization/*.cpp', +<<<<<<< HEAD +======= + 'torch/nativert/*.h', + 'torch/nativert/*.cpp', + 'torch/nativert/**/*.h', + 'torch/nativert/**/*.cpp', + 'torch/headeronly/**/*.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ # The negative filters below are to exclude files that include onnx_pb.h or @@ -271,6 +317,10 @@ exclude_patterns = [ 'torch/csrc/utils/generated_serialization_types.h', 'torch/csrc/utils/pythoncapi_compat.h', 'torch/csrc/inductor/aoti_runtime/sycl_runtime_wrappers.h', +<<<<<<< HEAD +======= + 'aten/src/ATen/ExpandBase.h', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] init_command = [ 'python3', @@ -366,7 +416,11 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'ruamel.yaml==0.17.4', +======= + 'ruamel.yaml==0.18.10', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = true @@ -379,6 +433,15 @@ command = [ '--', '@{{PATHSFILE}}' ] +<<<<<<< HEAD +======= +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'ruamel.yaml==0.18.10', +] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[linter]] code = 'NEWLINE' @@ -396,8 +459,15 @@ exclude_patterns=[ 'tools/clang_format_hash/**', 'test/cpp/jit/upgrader_models/*.ptl', 'test/cpp/jit/upgrader_models/*.ptl.ff', +<<<<<<< HEAD + '**/*.png', + '**/*.gz', +======= + 'test/dynamo/cpython/**', '**/*.png', '**/*.gz', + '**/*.patch', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -452,7 +522,11 @@ exclude_patterns = [ 'test/cpp/jit/upgrader_models/*.ptl.ff', '.ci/docker/common/install_rocm_drm.sh', '.lintrunner.toml', +<<<<<<< HEAD '.ci/magma/package_files/*.patch', +======= + '**/*.patch', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] command = [ 'python3', @@ -521,6 +595,10 @@ include_patterns = [ 'c10/**', 'aten/**', 'torch/csrc/**', +<<<<<<< HEAD +======= + 'torch/nativert/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ 'aten/src/ATen/native/quantized/cpu/qnnpack/**', @@ -748,6 +826,10 @@ include_patterns = [ 'aten/**', 'c10/**', 'torch/csrc/**', +<<<<<<< HEAD +======= + 'torch/nativert/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ 'aten/src/ATen/cuda/CUDAContext.cpp', @@ -933,6 +1015,10 @@ include_patterns = [ exclude_patterns = [ 'test/run_test.py', '**/fb/**', +<<<<<<< HEAD +======= + 'test/dynamo/cpython/3.13/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/quantization/**', # should be run through test/test_quantization.py 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py @@ -1001,6 +1087,10 @@ include_patterns = [ 'c10/**', 'aten/**', 'torch/csrc/**', +<<<<<<< HEAD +======= + 'torch/nativert/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ 'c10/util/CallOnce.h', @@ -1045,6 +1135,10 @@ include_patterns = [ 'c10/**', 'aten/**', 'torch/csrc/**', +<<<<<<< HEAD +======= + 'torch/nativert/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] exclude_patterns = [ '**/fb/**', @@ -1107,6 +1201,63 @@ init_command = [ 'PyYAML==6.0.1', ] +<<<<<<< HEAD +======= +[[linter]] +code = 'CODESPELL' +command = [ + 'python3', + 'tools/linter/adapters/codespell_linter.py', + '--', + '@{{PATHSFILE}}' +] +include_patterns = [ + '**', +] +exclude_patterns = [ + # We don't care too much about files in this directory, don't enforce + # spelling on them + 'caffe2/**', + 'fb/**', + '**/fb/**', + 'third_party/**', + 'test/dynamo/cpython/**', + 'torch/_vendor/**', + 'torch/_inductor/fx_passes/serialized_patterns/**', + 'torch/_inductor/autoheuristic/artifacts/**', + # These files are all grandfathered in, 
feel free to remove from this list + # as necessary + # NOTE: remove the patterns in the order they are listed + 'aten/**', + 'aten/src/ATen/native/**', + 'aten/src/ATen/native/q*/**', + 'aten/src/ATen/native/[a-pA-P]*/**', + 'aten/src/ATen/[a-mA-M]*/**', + 'test/**', + 'test/test_*', + 'test/[a-hA-h]*/**', + 'test/inductor/**', + 'test/dynamo/**', + 'test/distributed/**', + 'torch/**', + 'torch/_*/**', + 'torch/ao/**', + 'torch/fx/**', + 'torch/distributed/tensor/**', + 'torch/[j-o]*/**', + 'torch/utils/**', + 'torch/csrc/jit/**', + 'torch/csrc/jit/[a-o]*/**', +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'codespell[toml]==2.4.1', +] +is_formatter = true + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # usort + ruff-format [[linter]] code = 'PYFMT' @@ -1128,6 +1279,10 @@ exclude_patterns = [ 'caffe2/**/*.pyi', 'fb/**', '**/fb/**', +<<<<<<< HEAD +======= + 'test/dynamo/cpython/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'third_party/**/*.py', 'third_party/**/*.pyi', 'torch/_vendor/**', @@ -1135,12 +1290,15 @@ exclude_patterns = [ 'torch/_inductor/autoheuristic/artifacts/**', # These files are all grandfathered in, feel free to remove from this list # as necessary +<<<<<<< HEAD 'test/_nvfuser/__init__.py', 'test/_nvfuser/test_dynamo.py', 'test/_nvfuser/test_python_frontend.py', 'test/_nvfuser/test_torchscript.py', 'test/delete.py', 'test/expect/__init__.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/quantization/__init__.py', 'test/quantization/core/__init__.py', 'test/quantization/core/experimental/apot_fx_graph_mode_ptq.py', @@ -1162,6 +1320,7 @@ exclude_patterns = [ 'test/quantization/core/test_utils.py', 'test/quantization/core/test_workflow_module.py', 'test/quantization/core/test_workflow_ops.py', +<<<<<<< HEAD 'test/quantization/eager/__init__.py', 'test/quantization/eager/test_bias_correction_eager.py', 'test/quantization/eager/test_equalize_eager.py', @@ -1170,14 +1329,19 @@ exclude_patterns = [ 'test/quantization/eager/test_numeric_suite_eager.py', 'test/quantization/eager/test_quantize_eager_ptq.py', 'test/quantization/eager/test_quantize_eager_qat.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/quantization/fx/__init__.py', 'test/quantization/fx/test_equalize_fx.py', 'test/quantization/fx/test_model_report_fx.py', 'test/quantization/fx/test_numeric_suite_fx.py', 'test/quantization/fx/test_quantize_fx.py', 'test/quantization/fx/test_subgraph_rewriter.py', +<<<<<<< HEAD 'test/test_fake_tensor.py', 'test/test_flop_counter.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'test/test_function_schema.py', 'test/test_functional_autograd_benchmark.py', 'test/test_functional_optim.py', @@ -1255,10 +1419,13 @@ exclude_patterns = [ 'test/test_unary_ufuncs.py', 'test/test_vulkan.py', 'torch/_awaits/__init__.py', +<<<<<<< HEAD 'torch/_custom_op/__init__.py', 'torch/_custom_op/autograd.py', 'torch/_custom_op/functional.py', 'torch/_custom_op/impl.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
'torch/_export/__init__.py', 'torch/_export/constraints.py', 'torch/_export/db/__init__.py', @@ -1296,6 +1463,7 @@ exclude_patterns = [ 'torch/_export/db/examples/type_reflection_method.py', 'torch/_export/db/gen_example.py', 'torch/_export/db/logging.py', +<<<<<<< HEAD 'torch/_export/error.py', 'torch/_export/exported_program.py', 'torch/_export/pass_base.py', @@ -1314,6 +1482,8 @@ exclude_patterns = [ 'torch/_export/serde/upgrade.py', 'torch/_export/trace.py', 'torch/_export/verifier.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/testing/_internal/__init__.py', 'torch/testing/_internal/autocast_test_lists.py', 'torch/testing/_internal/autograd_function_db.py', @@ -1321,7 +1491,10 @@ exclude_patterns = [ 'torch/testing/_internal/codegen/__init__.py', 'torch/testing/_internal/codegen/random_topo_test.py', 'torch/testing/_internal/common_cuda.py', +<<<<<<< HEAD 'torch/testing/_internal/common_distributed.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/testing/_internal/common_jit.py', 'torch/testing/_internal/common_methods_invocations.py', 'torch/testing/_internal/common_modules.py', @@ -1338,6 +1511,7 @@ exclude_patterns = [ 'torch/testing/_internal/data/network1.py', 'torch/testing/_internal/data/network2.py', 'torch/testing/_internal/dist_utils.py', +<<<<<<< HEAD 'torch/testing/_internal/distributed/__init__.py', 'torch/testing/_internal/distributed/_shard/__init__.py', 'torch/testing/_internal/distributed/_shard/sharded_tensor/__init__.py', @@ -1370,6 +1544,8 @@ exclude_patterns = [ 'torch/testing/_internal/distributed/rpc/rpc_test.py', 'torch/testing/_internal/distributed/rpc/tensorpipe_rpc_agent_test_fixture.py', 'torch/testing/_internal/distributed/rpc_utils.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/testing/_internal/generated/__init__.py', 'torch/testing/_internal/hypothesis_utils.py', 'torch/testing/_internal/inductor_utils.py', @@ -1386,6 +1562,7 @@ exclude_patterns = [ 'torch/testing/_internal/test_module/__init__.py', 'torch/testing/_internal/test_module/future_div.py', 'torch/testing/_internal/test_module/no_future_div.py', +<<<<<<< HEAD 'torch/utils/_contextlib.py', 'torch/utils/_cpp_extension_versioner.py', 'torch/utils/_crash_handler.py', @@ -1405,6 +1582,11 @@ exclude_patterns = [ 'torch/utils/benchmark/examples/blas_compare_setup.py', 'torch/utils/benchmark/examples/compare.py', 'torch/utils/benchmark/examples/end_to_end.py', +======= + 'torch/utils/benchmark/__init__.py', + 'torch/utils/benchmark/examples/__init__.py', + 'torch/utils/benchmark/examples/compare.py', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'torch/utils/benchmark/examples/fuzzer.py', 'torch/utils/benchmark/examples/op_benchmark.py', 'torch/utils/benchmark/examples/simple_timeit.py', @@ -1466,7 +1648,10 @@ exclude_patterns = [ 'torch/utils/throughput_benchmark.py', 'torch/utils/viz/__init__.py', 'torch/utils/viz/_cycles.py', +<<<<<<< HEAD 'torch/utils/weak.py', +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] init_command = [ 'python3', @@ -1475,8 +1660,13 @@ init_command = [ '--no-black-binary', 'black==23.12.1', 
'usort==1.0.8.post1', +<<<<<<< HEAD 'isort==5.13.2', 'ruff==0.9.8', # sync with RUFF +======= + 'isort==6.0.1', + 'ruff==0.11.13', # sync with RUFF +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = true @@ -1537,13 +1727,26 @@ command = [ [[linter]] code = 'RUFF' +<<<<<<< HEAD include_patterns = ['**/*.py', '**/*.pyi'] +======= +include_patterns = [ + '**/*.py', + '**/*.pyi', + '**/*.ipynb', + 'pyproject.toml', +] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) exclude_patterns = [ 'caffe2/**', 'functorch/docs/**', 'functorch/notebooks/**', 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', +<<<<<<< HEAD +======= + 'test/dynamo/cpython/**', +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 'scripts/**', 'third_party/**', 'fb/**', @@ -1561,11 +1764,19 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', +<<<<<<< HEAD 'ruff==0.9.8', # sync with PYFMT ] is_formatter = true # This linter prevents merge conlicts in csv files in pytorch by enforcing +======= + 'ruff==0.11.13', # sync with PYFMT +] +is_formatter = true + +# This linter prevents merge conflicts in csv files in pytorch by enforcing +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # three lines of whitespace between entries such that unless people are modifying # the same line, merge conflicts should not arise in git or hg [[linter]] @@ -1720,7 +1931,11 @@ command = [ '@{{PATHSFILE}}' ] include_patterns = [ +<<<<<<< HEAD 'torch/**/not-exist.py' +======= + 'torch/_inductor/**/*.py' +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] is_formatter = false @@ -1737,3 +1952,30 @@ include_patterns = [ 'torch/_dynamo/**', ] is_formatter = false +<<<<<<< HEAD +======= + +[[linter]] +code = 'TEST_DEVICE_BIAS' +command = [ + 'python3', + 'tools/linter/adapters/test_device_bias_linter.py', + '--', + '@{{PATHSFILE}}', +] +include_patterns = [ + 'test/**/test_*.py', +] + +# 'header_only_linter' reports on properly testing header-only APIs. 
+[[linter]] +code = 'HEADER_ONLY_LINTER' +command = [ + 'python3', + 'tools/linter/adapters/header_only_linter.py', +] +include_patterns = [ + 'torch/header_only_apis.txt', +] +is_formatter = false +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/.vscode/extensions.json b/.vscode/extensions.json index 9b22ad8d65e5..b8207d4cd459 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -1,5 +1,17 @@ { +<<<<<<< HEAD "recommendations": [ "ms-python.python", ] +======= + "recommendations": [ + "ms-python.python", + "charliermarsh.ruff", + "ms-python.flake8", + "ms-python.mypy-type-checker", + "ms-vscode.cmake-tools", + "EditorConfig.EditorConfig", + "streetsidesoftware.code-spell-checker", + ] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/.vscode/settings_recommended.json b/.vscode/settings_recommended.json index 551a3ec2a5a3..30a059b15ec3 100644 --- a/.vscode/settings_recommended.json +++ b/.vscode/settings_recommended.json @@ -1,15 +1,67 @@ { +<<<<<<< HEAD "[python]": { "editor.tabSize": 4 }, "files.associations": { "*.py.in": "python", "*.pyi.in": "python" +======= + "files.associations": { + ".clang-format": "yaml", + ".clang-tidy": "yaml", + ".flake8": "ini", + ".coveragerc": "ini", + "*.py.in": "python", + "*.pyi.in": "python", + "*requirements*.txt": "pip-requirements", + "*requirements*.in": "pip-requirements", + "*.cpp.in": "cpp", + "*.h.in": "cpp", + "*.cmake.in": "cmake", + "Makefile.*": "makefile", + "*.Makefile": "makefile", + "BUCK": "starlark", + "BUCK.*": "starlark" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }, "files.eol": "\n", "files.insertFinalNewline": true, "files.trimFinalNewlines": true, "files.trimTrailingWhitespace": true, +<<<<<<< HEAD "python.linting.enabled": true, "python.linting.flake8Enabled": true +======= + "cmake.preferredGenerators": [ + "Ninja", + "Unix Makefiles" + ], + "cmake.configureEnvironment": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "cmake.sourceDirectory": "${workspaceFolder}", + "cmake.buildDirectory": "${workspaceFolder}/build", + "cmake.configureArgs": [ + "-DPython_EXECUTABLE=${workspaceFolder}/venv/bin/python", + "-DPython_ROOT_DIR=${workspaceFolder}/venv" + ], + "[python]": { + "editor.tabSize": 4, + "editor.defaultFormatter": "charliermarsh.ruff" + }, + "python.defaultInterpreterPath": "${workspaceFolder}/venv/bin/python", + "python.analysis.inlayHints.functionReturnTypes": true, + "flake8.importStrategy": "fromEnvironment", + "flake8.args": [ + "--append-config=${workspaceFolder}/.flake8" + ], + "ruff.importStrategy": "fromEnvironment", + "ruff.lineLength": 88, + "ruff.organizeImports": false, + "ruff.configurationPreference": "filesystemFirst", + "mypy-type-checker.importStrategy": "fromEnvironment", + "mypy-type-checker.preferDaemon": true, + "mypy-type-checker.reportingScope": "workspace" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000000..daf0f491702b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +- This is the only AGENTS.md, there are no recursive AGENTS.md diff --git a/BUILD.bazel b/BUILD.bazel index e848f441541d..9495e554b77b 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1,4 +1,8 @@ 
load("@bazel_skylib//lib:paths.bzl", "paths") +<<<<<<< HEAD +======= +load("@com_github_google_flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") load("@rules_python//python:defs.bzl", "py_library", "py_test") @@ -289,6 +293,10 @@ header_template_rule( substitutions = { "@AT_CUDNN_ENABLED@": "1", "@AT_CUSPARSELT_ENABLED@": "0", +<<<<<<< HEAD +======= + "@AT_HIPSPARSELT_ENABLED@": "0", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "@AT_ROCM_ENABLED@": "0", "@AT_MAGMA_ENABLED@": "0", "@NVCC_FLAGS_EXTRA@": "", @@ -375,6 +383,10 @@ cc_library( ":torch_headers", "@fbgemm", "@ideep", +<<<<<<< HEAD +======= + "@nlohmann", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ], alwayslink = True, ) @@ -497,7 +509,11 @@ filegroup( # To achieve finer granularity and make debug easier, caffe2 is split into three libraries: # ATen, caffe2 and caffe2_for_aten_headers. ATen lib group up source codes under # aten/ directory and caffe2 contains most files under `caffe2/` directory. Since the +<<<<<<< HEAD # ATen lib and the caffe2 lib would depend on each other, `caffe2_for_aten_headers` is splitted +======= +# ATen lib and the caffe2 lib would depend on each other, `caffe2_for_aten_headers` is split +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # out from `caffe2` to avoid dependency cycle. 
cc_library( name = "caffe2_for_aten_headers", @@ -579,9 +595,15 @@ cc_library( cu_library( name = "torch_cuda", srcs = [ +<<<<<<< HEAD "torch/csrc/distributed/c10d/intra_node_comm.cu", "torch/csrc/distributed/c10d/NanCheck.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", +======= + "torch/csrc/distributed/c10d/NanCheck.cu", + "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", + "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ], copts = torch_cuda_half_options, visibility = ["//visibility:public"], @@ -659,6 +681,18 @@ cc_library( # torch torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) +<<<<<<< HEAD +======= +flatbuffer_cc_library( + name = "torch_flatbuffers", + srcs = [ + "torch/csrc/jit/serialization/mobile_bytecode.fbs", + ], + flatc_args = ["--cpp", "--gen-mutable", "--scoped-enums"], + out_prefix = "torch/csrc/jit/serialization/", +) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cc_library( name = "torch_headers", hdrs = if_cuda( @@ -672,6 +706,10 @@ cc_library( ], exclude = [ "torch/csrc/*/generated/*.h", +<<<<<<< HEAD +======= + "torch/csrc/jit/serialization/mobile_bytecode_generated.h", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ] + torch_cuda_headers, ) + GENERATED_AUTOGRAD_CPP + [":version_h"], includes = [ @@ -686,6 +724,10 @@ cc_library( deps = [ ":aten_headers", ":caffe2_headers", +<<<<<<< HEAD +======= + ":torch_flatbuffers", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "//c10", "@com_github_google_flatbuffers//:flatbuffers", "@local_config_python//:python_headers", @@ -723,6 +765,7 @@ cc_library( srcs = if_cuda(glob( libtorch_cuda_sources, exclude = [ +<<<<<<< HEAD "torch/csrc/cuda/python_nccl.cpp", "torch/csrc/cuda/nccl.cpp", "torch/csrc/distributed/c10d/intra_node_comm.cu", @@ -731,6 +774,17 @@ cc_library( "torch/csrc/distributed/c10d/cuda/AsyncMM.cu", "torch/csrc/distributed/c10d/NanCheck.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", +======= + "torch/csrc/cuda/nccl.cpp", + "torch/csrc/cuda/python_nccl.cpp", + "torch/csrc/distributed/c10d/NanCheck.cu", + "torch/csrc/distributed/c10d/cuda/AsyncMM.cu", + "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ], )) + torch_sources, copts = TORCH_COPTS, diff --git a/CITATION.cff b/CITATION.cff index e6de8772cbf2..02e112a96d12 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -113,4 +113,8 @@ preferred-citation: publisher: name: ACM doi: "10.1145/3620665.3640366" +<<<<<<< HEAD url: "https://pytorch.org/assets/pytorch2-2.pdf" +======= + url: "https://docs.pytorch.org/assets/pytorch2-2.pdf" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f0dcbff73cd..e273b5cb4f5c 100644 
--- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,8 @@ +<<<<<<< HEAD cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +======= +cmake_minimum_required(VERSION 3.27 FATAL_ERROR) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this @@ -6,6 +10,10 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) # one is detected as "AppleClang". cmake_policy(SET CMP0010 NEW) cmake_policy(SET CMP0025 NEW) +<<<<<<< HEAD +======= +cmake_policy(SET CMP0126 OLD) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Enables CMake to set LTO on compilers other than Intel. cmake_policy(SET CMP0069 NEW) @@ -16,6 +24,11 @@ cmake_policy(SET CMP0069 NEW) # we do this (and we don't if cmake is old), but it's nice when it's possible, # and it's possible on our Windows configs. cmake_policy(SET CMP0092 NEW) +<<<<<<< HEAD +======= +# Don't remove the FindCUDA module +cmake_policy(SET CMP0146 OLD) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Prohibit in-source builds if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) @@ -54,21 +67,34 @@ set(CMAKE_C_STANDARD # ---[ Utils include(cmake/public/utils.cmake) +<<<<<<< HEAD # --- [ Check that minimal gcc version is 9.2+ if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.2) message( FATAL_ERROR "GCC-9.2 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" +======= +# --- [ Check that minimal gcc version is 9.3+ +if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3) + message( + FATAL_ERROR + "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) endif() # This define is needed to preserve behavior given anticpated changes to # cccl/thrust +<<<<<<< HEAD # https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html +======= +# https://nvidia.github.io/cccl/libcudacxx/standard_api/numerics_library/complex.html +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) string(APPEND CMAKE_CUDA_FLAGS " -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS") if(LINUX) +<<<<<<< HEAD include(cmake/CheckAbi.cmake) string(APPEND CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") @@ -82,6 +108,9 @@ if(LINUX) # compiled by the same toolchain again append_cxx_flag_if_supported("-fabi-version=11" CMAKE_CXX_FLAGS) endif() +======= + set(CXX_STANDARD_REQUIRED ON) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -271,6 +300,11 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +<<<<<<< HEAD +======= +cmake_dependent_option(USE_XCCL "Use XCCL" ON + "USE_XPU;UNIX;NOT APPLE" OFF) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
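As an aside on the `USE_XCCL` CMake option introduced in the hunk above: a minimal sketch of toggling it at build time, assuming the usual PyTorch pattern where `USE_*` environment variables are forwarded to CMake by `setup.py` (the flag name is taken from the diff; whether your build actually has XPU support is a separate question):

```bash
# Sketch only: disable the XCCL backend for one build, assuming setup.py
# forwards USE_* environment variables to CMake as it does for USE_CUDA.
USE_XCCL=0 python setup.py develop

# Or, for a standalone CMake configure, set the option explicitly.
cmake -S . -B build -DUSE_XCCL=ON
```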
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" @@ -344,10 +378,21 @@ cmake_dependent_option( USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF) cmake_dependent_option( +<<<<<<< HEAD +======= + USE_GLOO_IBVERBS "Use Gloo with ibverbs backend. Only available if USE_GLOO is on." OFF + "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF) +cmake_dependent_option( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option( USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) cmake_dependent_option( +<<<<<<< HEAD +======= + USE_C10D_XCCL "USE C10D XCCL" ON "USE_DISTRIBUTED;USE_XCCL" OFF) +cmake_dependent_option( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( USE_TENSORPIPE "Use TensorPipe. Only available if USE_DISTRIBUTED is on." ON @@ -531,7 +576,10 @@ if(USE_LIGHTWEIGHT_DISPATCH AND NOT STATIC_DISPATCH_BACKEND) endif() option(TRACING_BASED "Master flag to build Lite Interpreter with tracing build option" OFF) +<<<<<<< HEAD option(BUILD_EXECUTORCH "Master flag to build Executorch" ON) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # This is a fix for a rare build issue on Ubuntu: symbol lookup error: # miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: # mkl_blas_dsyrk @@ -563,6 +611,13 @@ if(MSVC) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /Zc:__cplusplus") set(CMAKE_NINJA_CMCLDEPS_RC OFF) +<<<<<<< HEAD +======= + if(MSVC_Z7_OVERRIDE) + # CMake set debug flags to use /Z7 + set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT Embedded) + endif() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) foreach( flag_var CMAKE_C_FLAGS @@ -575,12 +630,15 @@ if(MSVC) CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) +<<<<<<< HEAD # Replace /Zi and /ZI with /Z7 if(MSVC_Z7_OVERRIDE) if(${flag_var} MATCHES "/Z[iI]") string(REGEX REPLACE "/Z[iI]" "/Z7" ${flag_var} "${${flag_var}}") endif(${flag_var} MATCHES "/Z[iI]") endif(MSVC_Z7_OVERRIDE) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(${CAFFE2_USE_MSVC_STATIC_RUNTIME}) if(${flag_var} MATCHES "/MD") @@ -703,7 +761,11 @@ endif() if(USE_KLEIDIAI AND CMAKE_C_COMPILER_VERSION) if(CMAKE_C_COMPILER_VERSION VERSION_LESS 11) set(USE_KLEIDIAI OFF) +<<<<<<< HEAD message(WARNING "Disabling KleidiAI: Requires atleast GCC 11 or Clang 11") +======= + message(WARNING "Disabling KleidiAI: Requires at least GCC 11 or Clang 11") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() endif() @@ -872,7 +934,11 @@ cmake_dependent_option( "Whether to build the flash_attention kernel for scaled dot product attention.\ Will be disabled if not supported by the platform" ON +<<<<<<< HEAD "USE_CUDA OR USE_ROCM;NOT MSVC" 
+======= + "(USE_CUDA AND NOT MSVC) OR USE_ROCM" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OFF) # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem @@ -888,7 +954,11 @@ cmake_dependent_option( # USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake # if(USE_ROCM) +<<<<<<< HEAD if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) +======= + if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) include(cmake/External/aotriton.cmake) endif() endif() @@ -988,8 +1058,22 @@ endif() # ---[ Build flags Re-include to override append_cxx_flag_if_supported from # third_party/FBGEMM include(cmake/public/utils.cmake) +<<<<<<< HEAD if(NOT MSVC) string(APPEND CMAKE_CXX_FLAGS " -O2 -fPIC") +======= +if(USE_COLORIZE_OUTPUT) + set(CMAKE_COLOR_DIAGNOSTICS ON) +endif() +if(NOT MSVC) + string(APPEND CMAKE_CXX_FLAGS " -O2 -fPIC") + + # This prevents use of `c10::optional`, `c10::nullopt` etc within the codebase + string(APPEND CMAKE_CXX_FLAGS " -DC10_NODEPRECATED") + string(APPEND CMAKE_CUDA_FLAGS " -DC10_NODEPRECATED") + string(APPEND CMAKE_OBJCXX_FLAGS " -DC10_NODEPRECATED") + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Eigen fails to build with some versions, so convert this to a warning # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459 string(APPEND CMAKE_CXX_FLAGS " -Wall") @@ -1057,6 +1141,7 @@ if(NOT MSVC) CMAKE_CXX_FLAGS) append_cxx_flag_if_supported("-Qunused-arguments" CMAKE_CXX_FLAGS) +<<<<<<< HEAD if(${USE_COLORIZE_OUTPUT}) # Why compiler checks are necessary even when `try_compile` is used Because # of the bug in ccache that can incorrectly identify `-fcolor-diagnostics` @@ -1070,6 +1155,8 @@ if(NOT MSVC) endif() endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) append_cxx_flag_if_supported("-faligned-new" CMAKE_CXX_FLAGS) if(WERROR) @@ -1096,7 +1183,10 @@ if(NOT MSVC) if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13) append_cxx_flag_if_supported("-Wno-dangling-reference" CMAKE_CXX_FLAGS) append_cxx_flag_if_supported("-Wno-error=dangling-reference" CMAKE_CXX_FLAGS) +<<<<<<< HEAD append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() else() # Define export functions for AOTI. @@ -1256,7 +1346,11 @@ endif() add_subdirectory(c10) add_subdirectory(caffe2) +<<<<<<< HEAD # ---[ CMake related files Uninistall option. +======= +# ---[ CMake related files Uninstall option. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(NOT TARGET caffe2_uninstall) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in diff --git a/CODEOWNERS b/CODEOWNERS index ed5edc0abbb4..3b08a825a6b6 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -7,13 +7,21 @@ # Each line is a file pattern followed by one or more owners. # For module labels => owners mapping, please see https://github.com/pytorch/pytorch/issues/24422. 
+<<<<<<< HEAD /torch/utils/cpp_extension.py @fmassa @soumith @ezyang +======= +/torch/utils/cpp_extension.py @fmassa @ezyang @malfet +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Not there to strictly require the approval, but to be tagged as a reviewer # on the PRs to push them into a high priority inbox. /torch/csrc/autograd/ @albanD @soulitzer /torch/autograd/ @albanD @soulitzer /tools/autograd/ @albanD @soulitzer +<<<<<<< HEAD +======= +/torch/header_only_apis.txt @janeyx99 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /torch/nn/ @albanD @jbschlosser @mikaylagawarecki /torch/optim/ @albanD @janeyx99 /test/test_public_bindings.py @albanD @@ -21,6 +29,10 @@ /test/forward_backward_compatibility/check_forward_backward_compatibility.py @larryliu0820 /docs/source/conf.py @albanD /aten/src/ATen/native/tags.yaml @ezyang +<<<<<<< HEAD +======= +/.github/merge_rules.yaml @albanD @malfet +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Architecture Optimization (quantization, sparsity, etc.) /aten/src/ATen/native/ao_sparse @salilsdesai @kimishpatel @digantdesai @jianyuh @@ -49,12 +61,21 @@ nn/qat/ @jerryzh168 /torch/csrc/distributed/c10d/Ops.* @kwen2501 # ONNX Export +<<<<<<< HEAD /torch/_dynamo/backends/onnxrt.py @wschin @xadupre /torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 @xadupre /torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 @xadupre /torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 @xadupre /torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre /test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre +======= +/torch/_dynamo/backends/onnxrt.py @wschin +/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 +/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 +/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 +/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin +/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # CI /.ci @pytorch/pytorch-dev-infra @@ -134,7 +155,11 @@ torch/profiler/ @sraikund16 test/functorch/test_aotdispatch.py @ezyang @Chillee # Dataloader +<<<<<<< HEAD torch/utils/data/ @andrewkho @divyanshk +======= +torch/utils/data/ @divyanshk @ramanishsingh +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # hipify torch/utils/hipify/ @jeffdaily @jithunnair-amd @@ -164,6 +189,14 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd /torch/export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi /torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi +<<<<<<< HEAD +======= +# Dynamic Shapes +/torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka +/torch/fx/experimental/sym_node.py @bobrenjc93 @laithsakka +/torch/fx/experimental/recording.py @bobrenjc93 @laithsakka + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # serialization-related files /aten/src/ATen/MapAllocator* @mikaylagawarecki /caffe2/serialize/ @mikaylagawarecki diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e48eee1889eb..3c591eb9c8f0 100644 --- 
a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -112,8 +112,12 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows lazy.) ```bash +<<<<<<< HEAD conda uninstall pytorch -y yes | pip uninstall torch +======= + pip uninstall torch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` Next run `python setup.py clean`. After that, you can install in `develop` mode again. @@ -130,8 +134,13 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows git submodule deinit -f . git clean -xdf python setup.py clean +<<<<<<< HEAD git submodule update --init --recursive # very important to sync the submodules python setup.py develop # then try running the command again +======= + git submodule update --init --recursive + python setup.py develop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` 4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to experiment with some environment variables, you can pass them into the command: @@ -149,7 +158,11 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows - If you encounter an error such as ``` +<<<<<<< HEAD fatal: unable to access 'https://github.com/pybind11/pybind11.git': could not load PEM client certificate ... +======= + fatal: unable to access 'https://github.com/pybind/pybind11.git': could not load PEM client certificate ... +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ``` this is likely that you are using HTTP proxying and the certificate expired. To check if the certificate is valid, run `git config --global --list` and search for config like `http.proxysslcert=`. Then check certificate valid date by running @@ -180,6 +193,7 @@ You can use this script to check out a new nightly branch with the following: source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows ``` +<<<<<<< HEAD Or if you would like to re-use an existing conda environment, you can pass in the prefix argument (`--prefix`): @@ -188,6 +202,8 @@ the prefix argument (`--prefix`): source my-env/bin/activate # or `& .\my-env\Scripts\Activate.ps1` on Windows ``` +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`: ```bash @@ -237,6 +253,11 @@ dependencies as well as the nightly binaries into the repo directory. details. * [cuda](aten/src/ATen/native/cuda) - CUDA implementations of operators. +<<<<<<< HEAD +======= + * [mps](aten/src/ATen/native/mps) - MPS implementations of + operators for Apple's Metal GPU family. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * [sparse](aten/src/ATen/native/sparse) - CPU and CUDA implementations of COO sparse tensor operations * [mkl](aten/src/ATen/native/mkl) [mkldnn](aten/src/ATen/native/mkldnn) @@ -281,8 +302,11 @@ dependencies as well as the nightly binaries into the repo directory. * [caffe2](caffe2) - The Caffe2 library. * [core](caffe2/core) - Core files of Caffe2, e.g., tensor, workspace, blobs, etc. +<<<<<<< HEAD * [operators](caffe2/operators) - Operators of Caffe2. * [python](caffe2/python) - Python bindings to Caffe2. 
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * ... * [.circleci](.circleci) - CircleCI configuration management. [README](.circleci/README.md) @@ -291,7 +315,11 @@ dependencies as well as the nightly binaries into the repo directory. ### Python Unit Testing **Prerequisites**: +<<<<<<< HEAD The following packages should be installed with either `conda` or `pip`: +======= +The following packages should be installed with `pip`: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - `expecttest` and `hypothesis` - required to run tests - `mypy` - recommended for linting - `pytest` - recommended to run tests more selectively @@ -354,6 +382,7 @@ command runs tests such as `TestNN.test_BCELoss` and ### Local linting +<<<<<<< HEAD Install all prerequisites by running ```bash @@ -361,6 +390,9 @@ make setup-lint ``` You can now run the same linting steps that are used in CI locally via `make`: +======= +You can run the same linting steps that are used in CI locally via `make`: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash make lint @@ -436,7 +468,11 @@ PyTorch has two main types of documentation: These are the docs that you see over at [our docs website](https://pytorch.org/docs). - **Developer facing documentation**: Developer facing documentation is spread around our READMEs in our codebase and in +<<<<<<< HEAD the [PyTorch Developer Wiki](https://pytorch.org/wiki). +======= +the [PyTorch Developer Wiki](https://github.com/pytorch/pytorch/wiki). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you're interested in adding new developer docs, please read this [page on the wiki](https://github.com/pytorch/pytorch/wiki/Where-or-how-should-I-add-documentation) on our best practices for where to put it. The rest of this section is about user-facing documentation. @@ -484,7 +520,11 @@ In addition to the standard Google Style docstring formatting rules, the followi ### Building documentation +<<<<<<< HEAD To build the documentation: +======= +Note that the docs will only build with Python versions <3.13. To build the documentation: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 1. Build and install PyTorch @@ -499,8 +539,12 @@ pip install -r requirements.txt # Or if you prefer an uncontaminated global executable environment or do not want to go through the node configuration: # npm install katex && export PATH="$PATH:$(pwd)/node_modules/.bin" ``` +<<<<<<< HEAD > Note: if you installed `nodejs` with a different package manager (e.g., `conda`) then `npm` will probably install a version of `katex` that is not +======= +> Note: if you installed `nodejs` with a different package manager then `npm` will probably install a version of `katex` that is not +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) compatible with your version of `nodejs` and doc builds will fail. A combination of versions that is known to work is `node@6.13.1` and `katex@0.13.18`. 
To install the latter with `npm` you can run @@ -595,9 +639,14 @@ rsync -az me@my_machine:/path/to/pytorch/docs/cpp/build/html cpp/build ### Previewing documentation on PRs +<<<<<<< HEAD PyTorch will host documentation previews at `https://docs-preview.pytorch.org/pytorch/pytorch//index.html` once the `pytorch_python_doc_build` GitHub Actions job has completed on your PR. You can visit that page directly or find its link in the automated Dr. CI comment on your PR. +======= +PyTorch will host documentation previews at `https://docs-preview.pytorch.org/pytorch/pytorch//index.html` once the docs GitHub Actions job has completed on your PR. You can find its link in the automated pytorchbot comment on your PR or go to the URL +directly. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Adding documentation tests @@ -669,16 +718,28 @@ details. One downside to using `python setup.py develop` is that your development version of PyTorch will be installed globally on your account (e.g., if you run `import torch` anywhere else, the development version will be +<<<<<<< HEAD used. If you want to manage multiple builds of PyTorch, you can make use of [conda environments](https://conda.io/docs/using/envs.html) to maintain +======= +used). + +If you want to manage multiple builds of PyTorch, you can make use of +[venv environments](https://docs.python.org/3/library/venv.html) to maintain +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) separate Python package environments, each of which can be tied to a specific build of PyTorch. To set one up: ```bash +<<<<<<< HEAD conda create -n pytorch-myfeature source activate pytorch-myfeature +======= +python -m venv pytorch-myfeature +source pytorch-myfeature/bin/activate # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # if you run python now, torch will NOT be installed python setup.py develop ``` @@ -756,7 +817,10 @@ same. Using ccache in a situation like this is a real time-saver. Before building pytorch, install ccache from your package manager of choice: ```bash +<<<<<<< HEAD conda install ccache -c conda-forge +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sudo apt install ccache sudo yum install ccache brew install ccache @@ -990,7 +1054,11 @@ If you are working on the CUDA code, here are some useful CUDA debugging tips: 3. CUDA supports a lot of C++11/14 features such as, `std::numeric_limits`, `std::nextafter`, `std::tuple` etc. in device code. Many of such features are possible because of the [--expt-relaxed-constexpr](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-functions) +<<<<<<< HEAD nvcc flag. There is a known [issue](https://github.com/ROCm-Developer-Tools/HIP/issues/374) +======= + nvcc flag. There is a known [issue](https://github.com/ROCm/hip/issues/374) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) that ROCm errors out on device code, which uses such stl functions. 4. A good performance metric for a CUDA kernel is the [Effective Memory Bandwidth](https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/). 
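Relating to the ccache hunk above, a minimal sketch of checking that ccache is actually being hit during a PyTorch rebuild (the launcher variables are standard CMake settings and are optional if ccache is already detected; hit rates and cache layout will vary):

```bash
# Sketch: verify ccache is picking up compilations during a rebuild.
# Assumes ccache is already installed as described above.
ccache --zero-stats                  # reset hit/miss counters
# Optionally route compiler invocations through ccache explicitly.
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
python setup.py develop              # rebuild PyTorch
ccache --show-stats                  # non-zero hits indicate the cache is working
```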
@@ -1048,8 +1116,12 @@ than Linux, which are worth keeping in mind when fixing these problems. 3. If you have a Windows box (we have a few on EC2 which you can request access to) and you want to run the build, the easiest way is to just run `.ci/pytorch/win-build.sh`. +<<<<<<< HEAD If you need to rebuild, run `REBUILD=1 .ci/pytorch/win-build.sh` (this will avoid blowing away your Conda environment.) +======= + If you need to rebuild, run `REBUILD=1 .ci/pytorch/win-build.sh`. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Even if you don't know anything about MSVC, you can use cmake to build simple programs on Windows; this can be helpful if you want to learn more about some peculiar linking behavior @@ -1137,7 +1209,11 @@ CUDA, MSVC, and PyTorch versions are interdependent; please install matching ver | 10.2 | Visual Studio 2019 (16.X) (`_MSC_VER` < 1930) | 1.5.0 ~ 1.7.0 | | 11.0 | Visual Studio 2019 (16.X) (`_MSC_VER` < 1930) | 1.7.0 | +<<<<<<< HEAD Note: There's a [compilation issue](https://github.com/oneapi-src/oneDNN/issues/812) in several Visual Studio 2019 versions since 16.7.1, so please make sure your Visual Studio 2019 version is not in 16.7.1 ~ 16.7.5 +======= +Note: There's a [compilation issue](https://github.com/uxlfoundation/oneDNN/issues/812) in several Visual Studio 2019 versions since 16.7.1, so please make sure your Visual Studio 2019 version is not in 16.7.1 ~ 16.7.5 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Pre-commit tidy/linting hook @@ -1266,7 +1342,11 @@ in the meantime there will be some separation. There are a few "unusual" directories which, for historical reasons, are Caffe2/PyTorch specific. Here they are: +<<<<<<< HEAD - `CMakeLists.txt`, `Makefile`, `binaries`, `cmake`, `conda`, `modules`, +======= +- `CMakeLists.txt`, `Makefile`, `binaries`, `cmake`, `modules`, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) `scripts` are Caffe2-specific. Don't put PyTorch code in them without extra coordination. diff --git a/Dockerfile b/Dockerfile index 5cec2173063b..0c66b38c9f05 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,7 +70,11 @@ RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} ARG TARGETPLATFORM +<<<<<<< HEAD # INSTALL_CHANNEL whl - release, whl/nightly - nightly, whle/test - test channels +======= +# INSTALL_CHANNEL whl - release, whl/nightly - nightly, whl/test - test channels +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) RUN case ${TARGETPLATFORM} in \ "linux/arm64") pip install --extra-index-url https://download.pytorch.org/whl/cpu/ torch torchvision torchaudio ;; \ *) pip install --index-url https://download.pytorch.org/${INSTALL_CHANNEL}/${CUDA_PATH#.}/ torch torchvision torchaudio ;; \ diff --git a/Makefile b/Makefile index e5b4386b5dd2..d2c66d31b914 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ # This makefile does nothing but delegating the actual building to cmake. +<<<<<<< HEAD PYTHON = python3 PIP = $(PYTHON) -m pip NIGHTLY_TOOL_OPTS := pull @@ -18,17 +19,58 @@ ios: clean: # This will remove ALL build folders. 
@rm -r build*/ +======= + +SHELL = /bin/bash +.SHELLFLAGS := -eu -o pipefail -c +PYTHON ?= $(shell command -v python3 || command -v python) +PIP = $(PYTHON) -m pip +NIGHTLY_TOOL_OPTS := pull + +.PHONY: all +all: + @cmake -S . -B build $(shell $(PYTHON) ./scripts/get_python_cmake_flags.py) && \ + cmake --build build --parallel -- + +.PHONY: local +local: + @./scripts/build_local.sh + +.PHONY: android +android: + @./scripts/build_android.sh + +.PHONY: ios +ios: + @./scripts/build_ios.sh + +.PHONY: triton +triton: + $(PIP) uninstall -y triton + @./scripts/install_triton_wheel.sh + +.PHONY: clean +clean: # This will remove ALL build folders. + @rm -r build*/ || true + +.PHONY: linecount +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) linecount: @cloc --read-lang-def=caffe.cloc caffe2 || \ echo "Cloc is not available on the machine. You can install cloc with " && \ echo " sudo apt-get install cloc" +<<<<<<< HEAD +======= +.PHONY: ensure-branch-clean +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ensure-branch-clean: @if [ -n "$(shell git status --porcelain)" ]; then \ echo "Please commit or stash all changes before running this script"; \ exit 1; \ fi +<<<<<<< HEAD setup-env: ensure-branch-clean $(PYTHON) tools/nightly.py $(NIGHTLY_TOOL_OPTS) @@ -57,3 +99,52 @@ quicklint: triton: $(PIP) uninstall -y triton @./scripts/install_triton_wheel.sh +======= +.PHONY: setup-env +setup-env: ensure-branch-clean + $(PYTHON) tools/nightly.py $(NIGHTLY_TOOL_OPTS) + +.PHONY: setup-env-cuda +setup-env-cuda: + $(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --cuda" + +.PHONY: setup-env-rocm +setup-env-rocm: + $(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --rocm" + +.lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml + @echo "Setting up lintrunner..." + $(PIP) install lintrunner + lintrunner init + @echo "Generating .lintrunner.sha256..." + @mkdir -p .lintbin + @sha256sum requirements.txt pyproject.toml .lintrunner.toml > .lintbin/.lintrunner.sha256 + +.PHONY: setup-lint +setup-lint: .lintbin/.lintrunner.sha256 + +.PHONY: lazy-setup-lint +lazy-setup-lint: .lintbin/.lintrunner.sha256 + @if [ ! 
-x "$(shell command -v lintrunner)" ]; then \ + $(MAKE) setup-lint; \ + fi + +.PHONY: lint +lint: lazy-setup-lint + lintrunner --all-files + +.PHONY: quicklint +quicklint: lazy-setup-lint + lintrunner + +.PHONY: quickfix +quickfix: lazy-setup-lint + lintrunner --apply-patches + +# Deprecated target aliases +.PHONY: setup_env setup_env_cuda setup_env_rocm setup_lint +setup_env: setup-env +setup_env_cuda: setup-env-cuda +setup_env_rocm: setup-env-rocm +setup_lint: setup-lint +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/README.md b/README.md index eccd24e16cf4..f013dbe862a5 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,10 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o - [Using pre-built images](#using-pre-built-images) - [Building the image yourself](#building-the-image-yourself) - [Building the Documentation](#building-the-documentation) +<<<<<<< HEAD +======= + - [Building a PDF](#building-a-pdf) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Previous Versions](#previous-versions) - [Getting Started](#getting-started) - [Resources](#resources) @@ -169,8 +173,11 @@ Professional, or Community Editions. You can also install the build tools from https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not* come with Visual Studio Code by default. +<<<<<<< HEAD \* We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) An example of environment setup is shown below: * Linux: @@ -190,16 +197,33 @@ $ conda activate $ call "C:\Program Files\Microsoft Visual Studio\\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 ``` +<<<<<<< HEAD +======= +A conda environment is not required. You can also do a PyTorch build in a +standard virtual environment, e.g., created with tools like `uv`, provided +your system has installed all the necessary dependencies unavailable as pip +packages (e.g., CUDA, MKL.) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ##### NVIDIA CUDA Support If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following: - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA +<<<<<<< HEAD Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`. 
+======= +Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware + +If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. +Other potentially useful environment variables may be found in `setup.py`. If +CUDA is installed in a non-standard location, set PATH so that the nvcc you +want to use can be found (e.g., `export PATH=/usr/local/cuda-12.8/bin:$PATH`). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), Instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/) @@ -223,7 +247,11 @@ Other potentially useful environment variables may be found in `setup.py`. #### Get the PyTorch Source ```bash +<<<<<<< HEAD git clone --recursive https://github.com/pytorch/pytorch +======= +git clone https://github.com/pytorch/pytorch +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cd pytorch # if you are updating an existing checkout git submodule sync @@ -245,7 +273,12 @@ pip install -r requirements.txt ```bash pip install mkl-static mkl-include # CUDA only: Add LAPACK support for the GPU if needed +<<<<<<< HEAD conda install -c pytorch magma-cuda121 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo +======= +# magma installation: run with active conda environment. specify CUDA version to install +.ci/docker/common/install_magma_conda.sh 12.4 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # (optional) If using torch.compile with inductor/triton, install the matching version of triton # Run from the pytorch directory after cloning @@ -274,6 +307,7 @@ conda install -c conda-forge libuv=1.39 #### Install PyTorch **On Linux** +<<<<<<< HEAD If you would like to compile PyTorch with [new C++ ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) enabled, then first run this command: ```bash export _GLIBCXX_USE_CXX11_ABI=1 @@ -281,6 +315,8 @@ export _GLIBCXX_USE_CXX11_ABI=1 Please **note** that starting from PyTorch 2.5, the PyTorch build with XPU supports both new and old C++ ABIs. Previously, XPU only supported the new C++ ABI. If you want to compile with Intel GPU support, please follow [Intel GPU Support](#intel-gpu-support). +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) If you're compiling for AMD ROCm then first run this command: ```bash # Only run this if you're compiling for ROCm @@ -384,14 +420,22 @@ with such a step. 
On Linux ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" +<<<<<<< HEAD python setup.py build --cmake-only +======= +CMAKE_ONLY=1 python setup.py build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ccmake build # or cmake-gui build ``` On macOS ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" +<<<<<<< HEAD MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build --cmake-only +======= +MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ccmake build # or cmake-gui build ``` @@ -431,8 +475,22 @@ make -f docker.Makefile ### Building the Documentation +<<<<<<< HEAD To build documentation in various formats, you will need [Sphinx](http://www.sphinx-doc.org) and the readthedocs theme. +======= +To build documentation in various formats, you will need [Sphinx](http://www.sphinx-doc.org) +and the pytorch_sphinx_theme2. + +Before you build the documentation locally, ensure `torch` is +installed in your environment. For small fixes, you can install the +nightly version as described in [Getting Started](https://pytorch.org/get-started/locally/). + +For more complex fixes, such as adding a new module and docstrings for +the new module, you might need to install torch [from source](#from-source). +See [Docstring Guidelines](https://github.com/pytorch/pytorch/wiki/Docstring-Guidelines) +for docstring conventions. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ```bash cd docs/ @@ -446,17 +504,74 @@ Run `make` to get a list of all available output formats. If you get a katex error run `npm install katex`. If it persists, try `npm install -g katex` +<<<<<<< HEAD > Note: if you installed `nodejs` with a different package manager (e.g., `conda`) then `npm` will probably install a version of `katex` that is not compatible with your version of `nodejs` and doc builds will fail. A combination of versions that is known to work is `node@6.13.1` and `katex@0.13.18`. To install the latter with `npm` you can run ```npm install -g katex@0.13.18``` +======= +> [!NOTE] +> If you installed `nodejs` with a different package manager (e.g., +> `conda`) then `npm` will probably install a version of `katex` that is not +> compatible with your version of `nodejs` and doc builds will fail. +> A combination of versions that is known to work is `node@6.13.1` and +> `katex@0.13.18`. To install the latter with `npm` you can run +> ```npm install -g katex@0.13.18``` + +> [!NOTE] +> If you see a numpy incompatibility error, run: +> ``` +> pip install 'numpy<2' +> ``` + +When you make changes to the dependencies run by CI, edit the +`.ci/docker/requirements-docs.txt` file. + +#### Building a PDF + +To compile a PDF of all PyTorch documentation, ensure you have +`texlive` and LaTeX installed. On macOS, you can install them using: + +``` +brew install --cask mactex +``` + +To create the PDF: + +1. Run: + + ``` + make latexpdf + ``` + + This will generate the necessary files in the `build/latex` directory. + +2. Navigate to this directory and execute: + + ``` + make LATEXOPTS="-interaction=nonstopmode" + ``` + + This will produce a `pytorch.pdf` with the desired content. 
Run this + command one more time so that it generates the correct table + of contents and index. + +> [!NOTE] +> To view the Table of Contents, switch to the **Table of Contents** +> view in your PDF viewer. + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Previous Versions Installation instructions and binaries for previous PyTorch versions may be found +<<<<<<< HEAD on [our website](https://pytorch.org/previous-versions). +======= +on [our website](https://pytorch.org/get-started/previous-versions). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Getting Started @@ -504,7 +619,11 @@ To learn more about making a contribution to Pytorch, please see our [Contributi PyTorch is a community-driven project with several skillful engineers and researchers contributing to it. PyTorch is currently maintained by [Soumith Chintala](http://soumith.ch), [Gregory Chanan](https://github.com/gchanan), [Dmytro Dzhulgakov](https://github.com/dzhulgakov), [Edward Yang](https://github.com/ezyang), and [Nikita Shulga](https://github.com/malfet) with major contributions coming from hundreds of talented individuals in various forms and means. +<<<<<<< HEAD A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jamesb93), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). +======= +A non-exhaustive but growing list needs to mention: [Trevor Killeen](https://github.com/killeent), [Sasank Chilamkurthy](https://github.com/chsasank), [Sergey Zagoruyko](https://github.com/szagoruyko), [Adam Lerer](https://github.com/adamlerer), [Francisco Massa](https://github.com/fmassa), [Alykhan Tejani](https://github.com/alykhantejani), [Luca Antiga](https://github.com/lantiga), [Alban Desmaison](https://github.com/albanD), [Andreas Koepf](https://github.com/andreaskoepf), [James Bradbury](https://github.com/jekbradbury), [Zeming Lin](https://github.com/ebetica), [Yuandong Tian](https://github.com/yuandong-tian), [Guillaume Lample](https://github.com/glample), [Marat Dukhan](https://github.com/Maratyszcza), [Natalia Gimelshein](https://github.com/ngimel), [Christian Sarofeen](https://github.com/csarofeen), [Martin Raison](https://github.com/martinraison), [Edward Yang](https://github.com/ezyang), [Zachary Devito](https://github.com/zdevito). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Note: This project is unrelated to [hughperkins/pytorch](https://github.com/hughperkins/pytorch) with the same name. 
Hugh is a valuable contributor to the Torch community and has helped with many things Torch and PyTorch. diff --git a/RELEASE.md b/RELEASE.md index 30b03b42435a..828559ea7962 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -19,7 +19,11 @@ - [Cherry Picking Fixes](#cherry-picking-fixes) - [How to do Cherry Picking](#how-to-do-cherry-picking) - [Cherry Picking Reverts](#cherry-picking-reverts) +<<<<<<< HEAD - [Preparing and Creating Final Release candidate](#preparing-and-creating-final-release-candidate) +======= + - [Preparing and Creating Final Release Candidate](#preparing-and-creating-final-release-candidate) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Promoting RCs to Stable](#promoting-rcs-to-stable) - [Additional Steps to prepare for release day](#additional-steps-to-prepare-for-release-day) - [Modify release matrix](#modify-release-matrix) @@ -63,7 +67,11 @@ Following is the Release Compatibility Matrix for PyTorch releases: ## Release Cadence +<<<<<<< HEAD Following is the release cadence. All future dates below are tentative, for latest updates on the release scheduled please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional. +======= +Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) | Minor Version | Release branch cut | Release date | First patch release date | Second patch release date| | --- | --- | --- | --- | --- | @@ -91,6 +99,7 @@ Releasing a new version of PyTorch generally entails 3 major steps: ### Frequently Asked Questions +<<<<<<< HEAD * Q: What is release branch cut ? * A: When bulk of the tracked features merged into the main branch, the primary release engineer starts the release process of cutting the release branch by creating a new git branch based off of the current `main` development branch of PyTorch. This allows PyTorch development flow on `main` to continue uninterrupted, while the release engineering team focuses on stabilizing the release branch in order to release a series of release candidates (RC). The activities in the release branch include both regression and performance testing as well as polishing new features and fixing release-specific bugs. In general, new features *are not* added to the release branch after it was created. @@ -105,6 +114,22 @@ Following Requirements needs to be met prior to cutting a release branch: ``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ``` * Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of release matrix, including operating systems(Linux, MacOS, Windows), Python versions as well as CPU architectures(x86 and arm) and accelerator versions(CUDA, ROCm, XPU). * All the nightly jobs for pytorch and domain libraries should be green. Validate this using following HUD links: +======= +* Q: What is a release branch cut ? 
+ * A: When bulk of the tracked features merged into the main branch, the primary release engineer starts the release process of cutting the release branch by creating a new git branch based off of the current `main` development branch of PyTorch. This allows PyTorch development flow on `main` to continue uninterrupted, while the release engineering team focuses on stabilizing the release branch in order to release a series of release candidates (RC). The activities in the release branch include both regression and performance testing as well as polishing new features and fixing release-specific bugs. In general, new features *are not* added to the release branch after it was created. + +* Q: What is a cherry-pick ? + * A: A cherry pick is a process of propagating commits from the main into the release branch, utilizing git's built in [cherry-pick feature](https://git-scm.com/docs/git-cherry-pick). These commits are typically limited to small fixes or documentation updates to ensure that the release engineering team has sufficient time to complete a thorough round of testing on the release branch. To nominate a fix for cherry-picking, a separate pull request must be created against the respective release branch and then mentioned in the Release Tracker issue (example: https://github.com/pytorch/pytorch/issues/94937) following the template from the issue description. The comment nominating a particular cherry-pick for inclusion in the release should include the committed PR against main branch, the newly created cherry-pick PR, as well as the acceptance criteria for why the cherry-pick is needed in the first place. This process can be automated by using entering a comment `@pytorchbot cherry-pick -c [reason]` on the PR you wish to cherry-pick. + +## Cutting a release branch preparations + +Following requirements need to be met prior to cutting a release branch: + +* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before first RC cut is completed. After RC cut is completed, the following script should be executed from test-infra repo in order to validate the presence of the fixes in the release branch: +``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ``` +* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of release matrix, including operating systems (Linux, MacOS, Windows), Python versions as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm, XPU). +* All the nightly jobs for pytorch and domain libraries should be green. Validate this using the following HUD links: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/nightly) * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/nightly) * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/nightly) @@ -132,7 +157,11 @@ This script should create 2 branches: ### PyTorch ecosystem libraries *Note*: Release branches for individual ecosystem libraries should be created after first release candidate build of PyTorch is available in staging channels (which happens about a week after PyTorch release branch has been created). This is absolutely required to allow sufficient testing time for each of the domain library. 
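As a quick sanity check at this point, it can help to confirm that the build installed from the staging channel is really the release candidate you intend to validate against; a minimal sketch using only public `torch` attributes (the printed values depend entirely on your environment, nothing here is prescribed by the release process):

```python
# Sketch: confirm which PyTorch build is installed before cutting an
# ecosystem library branch. Output values depend on your install.
import torch

print(torch.__version__)          # version string reported by the installed wheel
print(torch.version.git_version)  # commit the wheel was built from
print(torch.version.cuda)         # CUDA toolkit the wheel targets (None for CPU/ROCm builds)
```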
Domain libraries branch cut is performed by Ecosystem Library POC. +<<<<<<< HEAD Test-Infra branch cut should be performed at the same time as Pytorch core branch cut. Convenience script can also be used domains. +======= +Test-Infra branch cut should be performed at the same time as Pytorch core branch cut. Convenience script can also be used for domains. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) > NOTE: RELEASE_VERSION only needs to be specified if version.txt is not available in root directory @@ -141,8 +170,16 @@ DRY_RUN=disabled GIT_BRANCH_TO_CUT_FROM=main RELEASE_VERSION=1.11 scripts/releas ``` ### Making release branch specific changes for PyTorch +<<<<<<< HEAD These are examples of changes that should be made to release branches so that CI / tooling can function normally on +======= +First you should cut a release branch for pytorch/test-infra: +* Create a new branch using the naming convention `release/[major].[minor]`, e.g. `release/2.7` +* On that release branch, update branch pointers for any pytorch-managed reusable actions or workflows to point to the new release's branch ([example](https://github.com/pytorch/test-infra/commit/749b9e36afa23298ad5498c9f5bcd96f5467baff#diff-d41015f3ac6cfa64b00e366bec416bb9487ac27493de7ebe7778fdfc7518b003R39)). + +Here are examples of changes that should be made to the pytorch/pytorch release branches so that CI / tooling can function normally on +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) them: * Update backwards compatibility tests to use RC binaries instead of nightlies @@ -163,8 +200,15 @@ Ecosystem libraries branch cut is done a few days after branch cut for the `pyto After the branch cut is performed, the Pytorch Dev Infra member should be informed of the branch cut and Domain Library specific change is required before Drafting RC for this domain library. Follow these examples of PR that updates the version and sets RC Candidate upload channel: +<<<<<<< HEAD * torchvision : https://github.com/pytorch/vision/pull/5400 * torchaudio: https://github.com/pytorch/audio/pull/2210 +======= +* torchvision : [Update version.txt](https://github.com/pytorch/vision/pull/8968) and [change workflow branch references](https://github.com/pytorch/vision/pull/8969) +* torchaudio: [Update version.txt](https://github.com/pytorch/audio/commit/654fee8fd17784271be1637eac1293fd834b4e9a) and [change workflow branch references](https://github.com/pytorch/audio/pull/3890) + +The CI workflow updating part of the above PRs can be automated by running: `python release/apply-release-changes.py [version]` (where version is something like '2.7'). That script lives in both pytorch/audio and pytorch/vision. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Running Launch Execution team Core XFN sync @@ -207,9 +251,13 @@ git tag -f v1.12.0-rc2 git push origin v1.12.0-rc2 ``` +<<<<<<< HEAD Pushing a release candidate should trigger the `binary_builds` workflow within CircleCI using [`pytorch/pytorch-probot`](https://github.com/pytorch/pytorch-probot)'s [`trigger-circleci-workflows`](trigger-circleci-workflows) functionality. 
This trigger functionality is configured here: [`pytorch-circleci-labels.yml`](https://github.com/pytorch/pytorch/blob/main/.github/pytorch-circleci-labels.yml) +======= +Pushing a release candidate tag should trigger the `binary_build` workflows. This trigger functionality is configured in [`linux_binary_build_workflow.yml.j2]`][(https://github.com/pytorch/pytorch/blob/main/.github/pytorch-circleci-labels.yml](https://github.com/pytorch/pytorch/blob/main/.github/templates/linux_binary_build_workflow.yml.j2#L19-L22)) and in the matching templates for the other OSes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) To view the state of the release build, please navigate to [HUD](https://hud.pytorch.org/hud/pytorch/pytorch/release%2F1.12). And make sure all binary builds are successful. ### Release Candidate Storage @@ -218,18 +266,30 @@ Release candidates are currently stored in the following places: * Wheels: https://download.pytorch.org/whl/test/ * Conda: https://anaconda.org/pytorch-test +<<<<<<< HEAD * Libtorch: https://download.pytorch.org/libtorch/test +======= +* Libtorch: https://download.pytorch.org/libtorch/test +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Backups are stored in a non-public S3 bucket at [`s3://pytorch-backup`](https://s3.console.aws.amazon.com/s3/buckets/pytorch-backup?region=us-east-1&tab=objects) ### Release Candidate health validation +<<<<<<< HEAD Validate the release jobs for pytorch and domain libraries should be green. Validate this using following HUD links: +======= +Validate that the release jobs for pytorch and domain libraries are green. Validate this using the following HUD links: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/release%2F1.12) * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/release%2F1.12) * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/release%2F1.12) +<<<<<<< HEAD Validate that the documentation build has completed and generated entry corresponding to the release in [docs repository](https://github.com/pytorch/docs/tree/main/). +======= +Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Cherry Picking Fixes @@ -274,6 +334,7 @@ requires `pytorchbot`, so it's only available in PyTorch atm. ### Cherry Picking Reverts +<<<<<<< HEAD If PR that has been cherry-picked into release branch has been reverted, its cherry-pick must be reverted as well. Reverts for changes that was committed into the main branch prior to the branch cut, must be propagated into release branch as well. @@ -283,6 +344,17 @@ Reverts for changes that was committed into the main branch prior to the branch The following requirements need to be met prior to creating final Release Candidate : * Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). The issue should either be closed or de-milestoned. 
+======= +If a PR that has been cherry-picked into the release branch has been reverted, its cherry-pick must be reverted as well. + +Reverts for changes that were committed into the main branch prior to the branch cut must be propagated into the release branch as well. + +## Preparing and Creating Final Release Candidate + +The following requirements need to be met prior to creating the final Release Candidate: + +* Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). Each issue should either be closed or de-milestoned. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * Validate that all closed milestone PRs are present in the release branch. Confirm this by running: ``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/2.2 --milestone-id 40 --missing-in-branch ``` @@ -291,7 +363,11 @@ The following requirements need to be met prior to creating final Release Candid * Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal. +<<<<<<< HEAD After the final RC is created. The following tasks should be performed : +======= +After the final RC is created, the following tasks should be performed: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal. @@ -319,6 +395,7 @@ Promotion should occur in two steps: * Promote S3 artifacts (wheels, libtorch) and Conda packages * Promote S3 wheels to PyPI +<<<<<<< HEAD **NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypa/warehouse/issues/726 for a discussion on potential draft releases within PyPI) ## Additional Steps to prepare for release day @@ -336,12 +413,35 @@ Please note: This PR needs to be merged on the release day and hence it should b ### Open Google Colab issue This is normally done right after the release is completed. We would need to create Google Colab Issue see following [PR](https://github.com/googlecolab/colabtools/issues/2372) +======= +**NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypi/warehouse/issues/726 for a discussion on potential draft releases within PyPI) + +## Additional Steps to prepare for release day + +The following should be prepared for the release day: + +### Modify release matrix + +Modify the release matrix for the get started page. See the following [PR](https://github.com/pytorch/test-infra/pull/4611) as reference. + +The PR to update published_versions.json and quick-start-module.js is auto generated. See the following [PR](https://github.com/pytorch/pytorch.github.io/pull/1467) as reference. + +Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the Release Candidate location as described in the [Release Candidate Storage](#release-candidate-storage) section. + +### Open Google Colab issue + +This is normally done right after the release is completed. We need to create a Google Colab issue. 
See the following example [issue](https://github.com/googlecolab/colabtools/issues/2372) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Patch Releases A patch release is a maintenance release of PyTorch that includes fixes for regressions found in a previous minor release. Patch releases typically will bump the `patch` version from semver (i.e. `[major].[minor].[patch]`). +<<<<<<< HEAD Please note: Starting from 2.1 one can expect up to 2 patch releases after every minor ones. Patch releases would only be published for latest minor release. +======= +Please note: Starting from 2.1, one can expect up to 2 patch releases after every minor release. Patch releases are only published for the latest minor release. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Patch Release Criteria @@ -363,6 +463,7 @@ Patch releases should be considered if a regression meets the following criteria > Main POC: Patch Release Managers, Triage Reviewers Patch releases should follow these high-level phases. This process starts immediately after the previous release has completed. +<<<<<<< HEAD Patch release process takes around 4-5 weeks to complete. 1. Triage, is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion. @@ -372,20 +473,43 @@ Patch release process takes around 4-5 weeks to complete. 3. Cherry picking phase starts after the decision is made to create patch release. At this point a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks. 4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger new build and produce new release candidate. Announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks. 5. General Availability +======= +The patch release process takes around 4-5 weeks to complete. + +1. Triage is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion. +2. Go/No Go meeting between PyTorch Releng, PyTorch Core and Project Managers where potential issues triggering a release in milestones are reviewed, and following decisions are made: + * Should the new patch release be created? + * Timeline execution for the patch release +3. Cherry picking phase starts after the decision is made to create a patch release. At this point, a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks. +4. Updating `version.txt` in the release branch to match expected patch release version, see https://github.com/pytorch/pytorch/commit/f77213d3dae5d103a39cdaf93f21863843571e8d as an example +5. 
Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger a new build and produce a new release candidate. An announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks. +6. General Availability +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Triage > Main POC: Triage Reviewers +<<<<<<< HEAD 1. Tag issues / pull requests that are candidates for a potential patch release with `triage review` * ![adding triage review label](https://user-images.githubusercontent.com/1700823/132589089-a9210a14-6159-409d-95e5-f79067f6fa38.png) 2. Triage reviewers will then check if the regression / fix identified fits within above mentioned [Patch Release Criteria](#patch-release-criteria) 3. Triage reviewers will then add the issue / pull request to the related milestone (i.e. `1.9.1`) if the regressions is found to be within the [Patch Release Criteria](#patch-release-criteria) +======= +1. Tag issues/pull requests that are candidates for a potential patch release with `triage review` + * ![adding triage review label](https://user-images.githubusercontent.com/1700823/132589089-a9210a14-6159-409d-95e5-f79067f6fa38.png) +2. Triage reviewers will then check if the regression/fix identified fits within the above mentioned [Patch Release Criteria](#patch-release-criteria) +3. Triage reviewers will then add the issue/pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * ![adding to milestone](https://user-images.githubusercontent.com/1700823/131175980-148ff38d-44c3-4611-8a1f-cd2fd1f4c49d.png) ### Issue Tracker for Patch releases +<<<<<<< HEAD For patch releases issue tracker needs to be created. For patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like: +======= +For patch releases, an issue tracker needs to be created. For a patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) * https://github.com/pytorch/pytorch/issues/128436 Only following issues are accepted: diff --git a/SECURITY.md b/SECURITY.md index 79514f2c282b..dfd67a86a76c 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -3,6 +3,10 @@ - [**Reporting a Vulnerability**](#reporting-a-vulnerability) - [**Using Pytorch Securely**](#using-pytorch-securely) - [Untrusted models](#untrusted-models) +<<<<<<< HEAD +======= + - [TorchScript models](#torchscript-models) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) - [Untrusted inputs](#untrusted-inputs) - [Data privacy](#data-privacy) - [Using distributed features](#using-distributed-features) @@ -38,6 +42,13 @@ Important Note: The trustworthiness of a model is not binary. 
You must always de https://arxiv.org/abs/2312.04748 https://arxiv.org/abs/2401.05566 +<<<<<<< HEAD +======= +### TorchScript models + +TorchScript models should treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note, that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load. + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ### Untrusted inputs during training and prediction If you plan to open your model to untrusted inputs, be aware that inputs can also be used as vectors by malicious agents. To minimize risks, make sure to give your model only the permissions strictly required, and keep your libraries updated with the latest security patches. diff --git a/WORKSPACE b/WORKSPACE index ae7c0644e203..2fb9833df67a 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -144,8 +144,13 @@ new_local_repository( new_local_repository( name = "asmjit", +<<<<<<< HEAD build_file = "//third_party:fbgemm/third_party/asmjit.BUILD", path = "third_party/fbgemm/third_party/asmjit", +======= + build_file = "//third_party:fbgemm/external/asmjit.BUILD", + path = "third_party/fbgemm/external/asmjit", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ) new_local_repository( @@ -185,6 +190,15 @@ new_local_repository( ) new_local_repository( +<<<<<<< HEAD +======= + name = "moodycamel", + build_file = "//third_party:moodycamel.BUILD", + path = "third_party/concurrentqueue", +) + +new_local_repository( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) name = "tensorpipe", build_file = "//third_party:tensorpipe.BUILD", path = "third_party/tensorpipe", diff --git a/android/README.md b/android/README.md index d6a1ba1d4479..5b8e588dcd5f 100644 --- a/android/README.md +++ b/android/README.md @@ -2,7 +2,13 @@ ## Demo applications and tutorials +<<<<<<< HEAD Demo applications with code walk-through can be find in [this github repo](https://github.com/pytorch/android-demo-app). +======= +Please refer to [pytorch-labs/executorch-examples](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch). + +Please join our [Discord](https://discord.com/channels/1334270993966825602/1349854760299270284) for any questions. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Publishing @@ -119,8 +125,11 @@ We also have to add all transitive dependencies of our aars. As `pytorch_android` [depends](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/build.gradle#L76-L77) on `'com.facebook.soloader:nativeloader:0.10.5'` and `'com.facebook.fbjni:fbjni-java-only:0.2.2'`, we need to add them. (In case of using maven dependencies they are added automatically from `pom.xml`). +<<<<<<< HEAD You can check out [test app example](https://github.com/pytorch/pytorch/blob/master/android/test_app/app/build.gradle) that uses aars directly. 
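Returning to the TorchScript-models note added in SECURITY.md above: a minimal sketch of the kind of guard that note implies, loading an archive only after verifying it is the exact artifact from a provider you already trust (the path and digest below are hypothetical placeholders, not part of the original text):

```python
# Sketch only: gate torch.jit.load on an integrity check, since loading or even
# introspecting a TorchScript archive can execute code embedded in it.
# "model.pt" and EXPECTED_SHA256 are hypothetical placeholders.
import hashlib

import torch

EXPECTED_SHA256 = "<digest published by a trusted provider>"

def load_trusted_scripted_model(path: str) -> torch.jit.ScriptModule:
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    if digest != EXPECTED_SHA256:
        raise RuntimeError(f"Refusing to load {path}: unexpected digest {digest}")
    return torch.jit.load(path)

# model = load_trusted_scripted_model("model.pt")
```

This does not make an untrusted archive safe to run; it only ensures you load exactly the artifact you had already decided to trust.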
+======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## Linking to prebuilt libtorch library from gradle dependency In some cases, you may want to use libtorch from your android native build. @@ -202,7 +211,11 @@ find_library(FBJNI_LIBRARY fbjni NO_CMAKE_FIND_ROOT_PATH) target_link_libraries(${PROJECT_NAME} +<<<<<<< HEAD ${PYTORCH_LIBRARY}) +======= + ${PYTORCH_LIBRARY} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ${FBJNI_LIBRARY}) ``` @@ -233,8 +246,11 @@ void loadAndForwardModel(const std::string& modelPath) { To load torchscript model for mobile we need some special setup which is placed in `struct JITCallGuard` in this example. It may change in future, you can track the latest changes keeping an eye in our [pytorch android jni code]([https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp#L28) +<<<<<<< HEAD [Example of linking to libtorch from aar](https://github.com/pytorch/pytorch/tree/master/android/test_app) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ## PyTorch Android API Javadoc You can find more details about the PyTorch Android API in the [Javadoc](https://pytorch.org/javadoc/). diff --git a/android/pytorch_android/CMakeLists.txt b/android/pytorch_android/CMakeLists.txt index 0d46f87094ca..1a6de640a518 100644 --- a/android/pytorch_android/CMakeLists.txt +++ b/android/pytorch_android/CMakeLists.txt @@ -1,4 +1,8 @@ +<<<<<<< HEAD cmake_minimum_required(VERSION 3.4.1) +======= +cmake_minimum_required(VERSION 3.5) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) option(BUILD_LITE_INTERPRETER "Master flag to build pytorch_jni_lite" ON) message( STATUS diff --git a/android/pytorch_android_torchvision/CMakeLists.txt b/android/pytorch_android_torchvision/CMakeLists.txt index 849e4d07cc1d..d001a2d2ab82 100644 --- a/android/pytorch_android_torchvision/CMakeLists.txt +++ b/android/pytorch_android_torchvision/CMakeLists.txt @@ -1,4 +1,8 @@ +<<<<<<< HEAD cmake_minimum_required(VERSION 3.4.1) +======= +cmake_minimum_required(VERSION 3.5) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) project(pytorch_vision_jni CXX) set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.") set(CMAKE_VERBOSE_MAKEFILE ON) diff --git a/android/settings.gradle b/android/settings.gradle index 743f388b6507..5a291d7df9dc 100644 --- a/android/settings.gradle +++ b/android/settings.gradle @@ -3,4 +3,7 @@ include ':app', ':pytorch_android', ':pytorch_android_torchvision', ':pytorch_ho project(':pytorch_android_torchvision').projectDir = file('pytorch_android_torchvision') project(':pytorch_host').projectDir = file('pytorch_android/host') +<<<<<<< HEAD project(':test_app').projectDir = file('test_app/app') +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index bda6aea32706..54facb45fd95 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -18,6 +18,10 @@ cmake_policy(SET CMP0012 NEW) ############################################# set(ATen_CPU_SRCS) +<<<<<<< HEAD 
+======= +set(ATen_MTIA_SRCS) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(ATen_XPU_SRCS) set(ATen_XPU_INCLUDE) set(ATen_CPU_TEST_SRCS) @@ -101,6 +105,16 @@ else() set(AT_CUSPARSELT_ENABLED 1) endif() +<<<<<<< HEAD +======= +# Add hipSPARSELt support flag if the package is available. +if(USE_ROCM AND hipsparselt_FOUND) + set(AT_HIPSPARSELT_ENABLED 1) +else() + set(AT_HIPSPARSELT_ENABLED 0) +endif() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/src) add_subdirectory(src/ATen) @@ -108,6 +122,10 @@ add_subdirectory(src/ATen) # Pass source, includes, and libs to parent set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) +<<<<<<< HEAD +======= +set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} PARENT_SCOPE) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 7ae6e39d7289..73be0e752477 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -1,4 +1,8 @@ +<<<<<<< HEAD cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +======= +cmake_minimum_required(VERSION 3.27 FATAL_ERROR) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) if(NOT MSVC) @@ -34,6 +38,10 @@ set_bool(AT_MAGMA_ENABLED USE_MAGMA) set_bool(CAFFE2_STATIC_LINK_CUDA_INT CAFFE2_STATIC_LINK_CUDA) set_bool(AT_CUDNN_ENABLED CAFFE2_USE_CUDNN) set_bool(AT_CUSPARSELT_ENABLED CAFFE2_USE_CUSPARSELT) +<<<<<<< HEAD +======= +set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h") # TODO: Do not generate CUDAConfig.h for ROCm BUILDS @@ -65,6 +73,15 @@ file(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh") file(GLOB cudnn_cpp "cudnn/*.cpp") file(GLOB ops_h "ops/*.h") +<<<<<<< HEAD +======= +# MTIA +file(GLOB mtia_h "mtia/*.h" "mtia/detail/*.h") +file(GLOB mtia_cpp "mtia/*.cpp" "mtia/detail/*.cpp") +file(GLOB_RECURSE native_mtia_cpp "native/mtia/*.cpp") +file(GLOB_RECURSE native_mtia_h "native/mtia/*.h") + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) file(GLOB xpu_h "xpu/*.h" "xpu/detail/*.h") file(GLOB xpu_cpp "xpu/*.cpp" "xpu/detail/*.cpp") @@ -162,6 +179,7 @@ file(GLOB native_transformers_hip_hip "native/transformers/hip/*.hip") file(GLOB native_transformers_hip_cpp "native/transformers/hip/*.cpp") file(GLOB native_quantized_cudnn_hip_cpp "native/quantized/cudnn/hip/*.cpp") file(GLOB native_utils_cpp "native/utils/*.cpp") +<<<<<<< HEAD # flash_attention sources file(GLOB flash_attention_cuda_kernels_cu ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src/*.cu) @@ -170,6 +188,12 @@ file(GLOB flash_attention_cuda_cpp "${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src/*.cpp" 
"native/transformers/cuda/flash_attn/flash_api.cpp" ) +======= +file(GLOB flash_attention_cuda_kernels_cu ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src/*.cu) +file(GLOB flash_attention_cuda_cpp ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src/*.cpp) +file(GLOB native_flash_attn_api_cpp "native/transformers/cuda/flash_attn/flash_api.cpp") + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # flash_attention hip sources file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip") @@ -201,10 +225,36 @@ file(GLOB mem_eff_attention_cuda_cu "native/transformers/cuda/mem_eff_attention/ file(GLOB mem_eff_attention_cuda_kernels_cu "native/transformers/cuda/mem_eff_attention/kernels/*.cu") file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention/*.cpp") +<<<<<<< HEAD if(USE_FLASH_ATTENTION) list(APPEND native_transformers_cuda_cu ${flash_attention_cuda_cu}) list(APPEND native_transformers_cuda_cu ${flash_attention_cuda_kernels_cu}) list(APPEND native_transformers_cuda_cpp ${flash_attention_cuda_cpp}) +======= +if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) + add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp}) + + target_include_directories(flash_attention PUBLIC + ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc + ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include + ${PROJECT_SOURCE_DIR}/third_party/cutlass/include + ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc/flash_attn/src + ) + + target_compile_definitions(flash_attention PRIVATE + # Copied from https://github.com/pytorch/pytorch/blob/a10024d7dea47c52469059a47efe376eb20adca0/caffe2/CMakeLists.txt#L1431 + FLASH_NAMESPACE=pytorch_flash + FLASHATTENTION_DISABLE_ALIBI + FLASHATTENTION_DISABLE_SOFTCAP + UNFUSE_FMA + ) + + set_target_properties(flash_attention PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +if(USE_FLASH_ATTENTION) + list(APPEND native_transformers_cuda_cpp ${native_flash_attn_api_cpp}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) list(APPEND FLASH_ATTENTION_CUDA_SOURCES ${flash_attention_cuda_cu} ${flash_attention_cuda_kernels_cu}) list(APPEND ATen_ATTENTION_KERNEL_SRCS ${flash_attention_cuda_kernels_cu}) @@ -259,7 +309,10 @@ if(AT_MKL_ENABLED) endif() if(AT_KLEIDIAI_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${native_kleidiai}) +<<<<<<< HEAD include_directories(SYSTEM INTERFACE ${KLEIDIAI_INCLUDE_DIRS}) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() if(AT_MKLDNN_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkldnn_cpp}) @@ -270,6 +323,13 @@ else() set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp}) endif() +<<<<<<< HEAD +======= +if(USE_MTIA) + set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h}) +endif() + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if(USE_XPU) list(APPEND ATen_XPU_SRCS ${mkldnn_xpu_cpp}) list(APPEND ATen_XPU_DEPENDENCY_LIBS xpu_mkldnn) @@ -384,12 +444,20 @@ endif() ${native_quantized_hip_hip} ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} ) +<<<<<<< HEAD if(WIN32) # Windows doesn't support Composable Kernels and Triton file(GLOB 
native_hip_bgemm "native/hip/bgemm_kernels/*.hip") file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" ${native_hip_bgemm} ${native_hip_ck} ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) +======= + if(WIN32) # Windows doesn't support Composable Kernels + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_hip_bgemm} ${native_hip_ck}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) list(APPEND all_hip_cpp @@ -408,9 +476,12 @@ endif() ${miopen_cpp} ${all_hip_cpp} ) +<<<<<<< HEAD if(WIN32) # Windows doesn't support Triton exclude(all_hip_cpp "${all_hip_cpp}" ${native_transformers_hip_cpp}) endif() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif() if(USE_XPU) @@ -422,6 +493,7 @@ endif() list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..) if(BLAS_FOUND) +<<<<<<< HEAD if($ENV{TH_BINARY_BUILD}) message(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.") list(APPEND ATen_CPU_DEPENDENCY_LIBS @@ -429,6 +501,9 @@ if(BLAS_FOUND) else($ENV{TH_BINARY_BUILD}) list(APPEND ATen_CPU_DEPENDENCY_LIBS ${BLAS_LIBRARIES}) endif($ENV{TH_BINARY_BUILD}) +======= + list(APPEND ATen_CPU_DEPENDENCY_LIBS ${BLAS_LIBRARIES}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif(BLAS_FOUND) if(LAPACK_FOUND) @@ -614,8 +689,12 @@ endif() if($ENV{TH_BINARY_BUILD}) # Do not do this on Linux: see Note [Extra MKL symbols for MAGMA in torch_cpu] # in caffe2/CMakeLists.txt +<<<<<<< HEAD list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}") +======= + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${BLAS_LIBRARIES}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) endif($ENV{TH_BINARY_BUILD}) endif(MSVC) endif(USE_MAGMA) @@ -689,7 +768,11 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h} ${ATen_TRANSFORMER_HEADERS}) if(NOT INTERN_BUILD_MOBILE) +<<<<<<< HEAD list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_kleidiai_h} ${native_mps_h} ${native_utils_h} ${miopen_h} ${mkldnn_xpu_h}) +======= + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${native_mtia_h} ${cudnn_h} ${hip_h} ${mtia_h} ${xpu_h} ${mps_h} ${native_kleidiai_h} ${native_mps_h} ${native_utils_h} ${miopen_h} ${mkldnn_xpu_h}) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) # Metal if(USE_PYTORCH_METAL_EXPORT) # Add files needed from exporting metal models(optimized_for_mobile) @@ -760,6 +843,10 @@ set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SC set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_MPS_SRCS ${ATen_MPS_SRCS} 
PARENT_SCOPE) +<<<<<<< HEAD +======= +set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} PARENT_SCOPE) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index 4bbe3624a5b0..08ad9b003fd9 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -69,7 +69,11 @@ Generator createCPUGenerator(uint64_t seed_val) { * Helper function to concatenate two 32 bit unsigned int * and return them as a 64 bit unsigned int */ +<<<<<<< HEAD inline uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) { +======= +inline static uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (static_cast(hi) << 32) | lo; } diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 08e49d5e1b57..26d61641cdc3 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -335,15 +335,28 @@ at::BlasBackend Context::blasPreferredBackend() { static const bool hipblaslt_preferred = []() { static const std::vector archs = { "gfx90a", "gfx942", +<<<<<<< HEAD #if ROCM_VERSION >= 60300 "gfx1200", "gfx1201", #endif +======= +#if ROCM_VERSION >= 60400 + "gfx1200", "gfx1201", +#endif +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if ROCM_VERSION >= 60500 "gfx950" #endif }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { +<<<<<<< HEAD if (!detail::getCUDAHooks().isGPUArch(index, archs)) { +======= + if (!detail::getCUDAHooks().isGPUArch(archs, index)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } } @@ -364,12 +377,22 @@ at::BlasBackend Context::blasPreferredBackend() { #if ROCM_VERSION >= 60300 "gfx1100", "gfx1101", "gfx1200", "gfx1201", #endif +<<<<<<< HEAD +======= +#if ROCM_VERSION >= 60402 + "gfx1150", "gfx1151", +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if ROCM_VERSION >= 60500 "gfx950" #endif }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { +<<<<<<< HEAD if (!detail::getCUDAHooks().isGPUArch(index, archs)) { +======= + if (!detail::getCUDAHooks().isGPUArch(archs, index)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE( "Attempting to use hipBLASLt on an unsupported architecture! " "Overriding blas backend to hipblas"); @@ -422,7 +445,11 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { "gfx90a", "gfx942", "gfx950" }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { +<<<<<<< HEAD if (!detail::getCUDAHooks().isGPUArch(index, archs)) { +======= + if (!detail::getCUDAHooks().isGPUArch(archs, index)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE( "Attempting to use CK on an unsupported architecture! 
Cannot set backend to CK"); return true; @@ -618,7 +645,11 @@ Allocator* getCPUAllocator() { // means the allow_tf32 flags are overridden and tf32 is force disabled // override_allow_tf32_flag = false // means the original allow_tf32 flags are followed +<<<<<<< HEAD thread_local bool override_allow_tf32_flag = false; +======= +thread_local static bool override_allow_tf32_flag = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NoTF32Guard::NoTF32Guard() { if (!override_allow_tf32_flag) { @@ -641,7 +672,11 @@ bool NoTF32Guard::should_disable_tf32() { // This information can be used, for example, to select implementations // with different numerical or performance characteristics. // See https://pytorch.org/docs/stable/notes/numerical_accuracy.html for details. +<<<<<<< HEAD thread_local bool rocm_is_backward_pass; +======= +thread_local static bool rocm_is_backward_pass; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ROCmBackwardPassGuard::ROCmBackwardPassGuard() { rocm_is_backward_pass = true; @@ -697,7 +732,11 @@ void Context::setAllowFP16ReductionCPU(bool b) { #else if (true) #endif +<<<<<<< HEAD throw std::runtime_error("Float16 arithmetic is not supported by the CPU!"); +======= + TORCH_CHECK(false, "Float16 arithmetic is not supported by the CPU!"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } allow_fp16_reduction_cpu = b; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 7d0f4c445f38..5ccbdb346cd2 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -550,7 +550,12 @@ inline size_t getNumGPUs() { // devices for a specific device type, add that function to the // relevant library (e.g., similar to at::cuda::device_count()) if (hasCUDA() && hasHIP()) { +<<<<<<< HEAD throw std::runtime_error( +======= + TORCH_CHECK( + false, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Enabling both CUDA and HIP in ATen is not supported, as HIP masquerades " "to be CUDA (e.g., when you say CUDA, on a HIP build of ATen, this actually " "means HIP. 
Rebuild PyTorch with one or the other disabled."); diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 2d16299c780d..8dda0c3acfbc 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -71,6 +71,12 @@ DLDataType getDLDataType(const Tensor& t) { case ScalarType::Float8_e8m0fnu: TORCH_CHECK(false, "float8 types are not supported by dlpack"); break; +<<<<<<< HEAD +======= + case ScalarType::Float4_e2m1fn_x2: + TORCH_CHECK(false, "float4 types are not supported by dlpack"); + break; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case ScalarType::QInt8: case ScalarType::QUInt8: case ScalarType::QInt32: diff --git a/aten/src/ATen/DeviceAccelerator.cpp b/aten/src/ATen/DeviceAccelerator.cpp index 7efa561e1801..4e4927cd9a06 100644 --- a/aten/src/ATen/DeviceAccelerator.cpp +++ b/aten/src/ATen/DeviceAccelerator.cpp @@ -76,7 +76,11 @@ c10::DeviceIndex deviceCount() { return static_cast(0); } c10::impl::VirtualGuardImpl impl(device_type.value()); +<<<<<<< HEAD return static_cast(impl.deviceCount()); +======= + return impl.deviceCount(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void setDeviceIndex(c10::DeviceIndex device_index) { @@ -88,7 +92,11 @@ void setDeviceIndex(c10::DeviceIndex device_index) { c10::DeviceIndex getDeviceIndex() { const auto device_type = getAccelerator(true).value(); c10::impl::VirtualGuardImpl impl(device_type); +<<<<<<< HEAD return static_cast(impl.getDevice().index()); +======= + return impl.getDevice().index(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void setCurrentStream(c10::Stream stream) { @@ -115,6 +123,24 @@ void synchronizeDevice(c10::DeviceIndex device_index) { // impl.synchronizeDevice should can be safely called from any device impl.synchronizeDevice(device_index); } +<<<<<<< HEAD +======= + +c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + c10::impl::VirtualGuardImpl impl(device_type); + return impl.exchangeDevice({device_type, device_index}).index(); +} + +c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index) { + const auto device_type = getAccelerator(true).value(); + c10::impl::VirtualGuardImpl impl(device_type); + // Avoid creating a new context if the context for the given device_index + // is not initialized. + impl.uncheckedSetDevice({device_type, device_index}); + return impl.getDevice().index(); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTEND(bugprone-unchecked-optional-access) } // namespace at::accelerator diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h index 60e74a90d604..6b57ce8e2d39 100644 --- a/aten/src/ATen/DeviceAccelerator.h +++ b/aten/src/ATen/DeviceAccelerator.h @@ -26,6 +26,7 @@ TORCH_API std::optional getAccelerator(bool checked = false); // Check if the given device type is an accelerator. TORCH_API bool isAccelerator(c10::DeviceType device_type); +<<<<<<< HEAD // Check if the given device type is an accelerator, not an excluded one. 
TORCH_API inline bool isAcceleratorExcluded( c10::DeviceType device_type, @@ -33,6 +34,8 @@ TORCH_API inline bool isAcceleratorExcluded( return device_type != excluded && isAccelerator(device_type); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Check if the given device type is an accelerator, not the excluded ones. template < typename... T, @@ -41,8 +44,17 @@ TORCH_API inline bool isAcceleratorExcluded( c10::DeviceType device_type, c10::DeviceType first_excluded, T... rest_excluded) { +<<<<<<< HEAD return device_type != first_excluded && isAcceleratorExcluded(device_type, rest_excluded...); +======= + if constexpr (sizeof...(rest_excluded) > 0) { + return device_type != first_excluded && + isAcceleratorExcluded(device_type, rest_excluded...); + } else { + return device_type != first_excluded && isAccelerator(device_type); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Return the number of the device available. Note that this is *REQUIRED* to @@ -66,6 +78,18 @@ TORCH_API c10::Stream getCurrentStream(c10::DeviceIndex device_index); // on the given device index has been completed. TORCH_API void synchronizeDevice(c10::DeviceIndex device_index); +<<<<<<< HEAD +======= +// Set the current device index to the given device_index and return the +// original device index that was active before the change. +TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index); + +// Set the current device index to the given device_index. Avoid creating a new +// context if the context for device_index is not initialized. Return the +// original device index that was active before the change. 
+TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::accelerator namespace at { diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 5c7b39c6427a..6bdc3d9e2f5e 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -200,7 +200,11 @@ inline at::ScalarType scalar_type(at::ScalarType s) { switch (_st) { \ __VA_ARGS__ \ default: \ +<<<<<<< HEAD TORCH_CHECK( \ +======= + TORCH_CHECK_NOT_IMPLEMENTED( \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) false, \ '"', \ at_dispatch_name, \ diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 5361d6b2d0c3..9199936eee87 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -28,8 +28,12 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { opt_device_type = at::getAccelerator(false); } if (opt_device_type.has_value()) { +<<<<<<< HEAD return at::globalContext().getPinnedMemoryAllocator( opt_device_type.value()); +======= + return at::globalContext().getPinnedMemoryAllocator(opt_device_type); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { TORCH_CHECK( false, "Need to provide pin_memory allocator to use pin memory.") @@ -160,17 +164,33 @@ SymInt computeStorageNbytes( // of the last element according to stride SymInt size = 1; for (const auto i : c10::irange(sizes.size())) { +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_eq(0))) { return 0; } +======= + if (TORCH_GUARD_OR_FALSE(sizes[i].sym_eq(0))) { + return 0; + } + + // NOTE: while this can technically return negative sizes for + // 0-element tensors, there's a check in TensorShape:set_storage_meta__symint + // that skips setting nbytes with unbacked expressions. + // Would probably be safer to wrap this with a max(*, 0), + // once our min/max symbolic reasoning improves. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) size += strides[i] * (sizes[i] - 1); } return itemsize_bytes * (storage_offset + size); } template +<<<<<<< HEAD TensorBase _empty_generic( +======= +static TensorBase _empty_generic( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ArrayRef size, c10::Allocator* allocator, c10::DispatchKeySet ks, @@ -223,7 +243,11 @@ TensorBase empty_generic_symint( } template +<<<<<<< HEAD TensorBase _empty_strided_generic( +======= +static TensorBase _empty_strided_generic( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T size, T stride, c10::Allocator* allocator, diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index 5f6be741ce01..a2354b3d88ae 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -59,7 +59,11 @@ SymDimVector infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b) { } template +<<<<<<< HEAD C10_ALWAYS_INLINE InferExpandGeometryResult inferExpandGeometryImpl( +======= +C10_ALWAYS_INLINE static InferExpandGeometryResult inferExpandGeometryImpl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) IntArrayRef tensor_sizes, IntArrayRef tensor_strides, IntArrayRef sizes) { diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index e9abc85b59c3..c2348306b2f7 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -461,9 +461,23 @@ inline Tensor _sum_to( reduce_dims.push_back(i); } for (int64_t i = leading_dims; i < static_cast(sizes.size()); ++i) { +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(shape[i - leading_dims], 1)) && TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(sizes[i], 1))) { reduce_dims.push_back(i); +======= + if (TORCH_GUARD_OR_FALSE(sym_eq(shape[i - leading_dims], 1)) && + TORCH_GUARD_OR_TRUE(sym_ne(sizes[i], 1))) { + reduce_dims.push_back(i); + } else { + // if we assume no reduction due to unbacked we ensure that at runtime. 
+ TORCH_MAYBE_SYM_CHECK( + sym_eq(shape[i - leading_dims], sizes[i]), + "non-reduction path was assumed due to unbacked symbols; expected those two sizes to be the same:", + shape[i - leading_dims], + ", ", + sizes[i]) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 409f944a88e3..97316bfd6b68 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -737,7 +737,11 @@ bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { } template +<<<<<<< HEAD bool isFunctionalTensorIListRef(c10::IListRef list) { +======= +static bool isFunctionalTensorIListRef(c10::IListRef list) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (list.size() == 0) return false; auto functional_count = 0; for (const auto& tensor : list) { @@ -803,7 +807,11 @@ void set_sizes_strides_offset(const std::vector& outs, const std::vector } } +<<<<<<< HEAD thread_local bool _functionalizationReapplyViews; +======= +thread_local static bool _functionalizationReapplyViews; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool getFunctionalizationReapplyViewsTLS() { return _functionalizationReapplyViews; diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 36b6f91c1d99..63d26e7042d6 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -7,6 +7,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -315,8 +319,38 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt // See Note [Propagating strides in the functionalization pass] // (for _unsafe_view, I'm just manually doing the shape inference rule here instead of calling the meta function for unsafe_view) auto inferred_size = at::infer_size_dv(size, self.sym_numel()); +<<<<<<< HEAD auto stride = at::detail::computeStride(self.sym_sizes(), self.sym_strides(), inferred_size); TORCH_INTERNAL_ASSERT(stride.has_value()); +======= + + auto stride = at::detail::computeStride(self.sym_sizes(), self.sym_strides(), inferred_size); + + if (!stride.has_value()) { + // With unbacked symints, computeStride could fail even on contiguous + // tensors. In this case, we can use the strides of an empty tensor of + // inferred_size.
+ TORCH_CHECK( + self.is_contiguous(), + "View is not valid from size:", + self.sym_sizes(), + " stride: ", + self.sym_strides(), + " to shape: ", + inferred_size, + " in case of unbacked symbols consider adding torch.check to guide computing strides."); + + stride = at::detail::empty_symint_meta( + inferred_size, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt) + .sym_strides(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) out.unsafeGetTensorImpl()->set_sizes_and_strides(inferred_size, stride.value()); return out; } diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index 3bcccfad971c..b7adde7ca6fd 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -25,11 +25,16 @@ inline void infer_size_impl( // N.B. this is an index, not a sym dim! std::optional infer_dim; for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { +<<<<<<< HEAD if (shape[dim] == -1) { +======= + if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (infer_dim) { throw std::runtime_error("only one dimension can be inferred"); } infer_dim = dim; +<<<<<<< HEAD } else if (shape[dim] >= 0) { newsize *= shape[dim]; } else { @@ -62,6 +67,56 @@ inline void infer_size_impl( std::ostringstream ss; ss << "shape '" << shape << "' is invalid for input of size " << numel; throw std::runtime_error(ss.str()); +======= + } else { + // in case of unbacked shape[dim] we assume it's not -1 and add a runtime + // assertion. + TORCH_MAYBE_SYM_CHECK( + sym_gt(shape[dim], -1), + "invalid shape dimension ", + shape[dim], + " at index ", + dim, + " of shape ", + shape); + newsize *= shape[dim]; + } + } + + auto set_infer_dim = [&]() { + // We have a degree of freedom here to select the dimension size; follow + // NumPy semantics and just bail. However, a nice error message is needed + // because users often use `view` as a way to flatten & unflatten + // dimensions and will otherwise be confused why + // empty_tensor.view( 0, 0) + // works yet + // empty_tensor.view(-1, 0) + // doesn't. 
+ TORCH_CHECK( + newsize != 0, + "cannot reshape tensor of 0 elements into shape ", + shape, + " because the unspecified dimension size -1 can be any " + "value and is ambiguous"); + res[*infer_dim] = numel / newsize; + return; + }; + + if (infer_dim && newsize > 0 && numel % newsize == 0) { + set_infer_dim(); + return; + } + + TORCH_MAYBE_SYM_CHECK( + sym_eq(numel, newsize), + "shape '", + shape, + "' is invalid for input of size ", + numel); + if (infer_dim) { + set_infer_dim(); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline std::vector infer_size(IntArrayRef shape, int64_t numel) { diff --git a/aten/src/ATen/LegacyVmapMode.cpp b/aten/src/ATen/LegacyVmapMode.cpp index 731f8cbafd82..7e5eab1cb93a 100644 --- a/aten/src/ATen/LegacyVmapMode.cpp +++ b/aten/src/ATen/LegacyVmapMode.cpp @@ -2,7 +2,11 @@ namespace at::impl { +<<<<<<< HEAD thread_local int64_t VmapMode_current_vmap_level = 0; +======= +thread_local static int64_t VmapMode_current_vmap_level = 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t VmapMode::current_vmap_level() { return VmapMode_current_vmap_level; diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp index 61336037d71b..15e59033f2da 100644 --- a/aten/src/ATen/MemoryOverlap.cpp +++ b/aten/src/ATen/MemoryOverlap.cpp @@ -35,7 +35,11 @@ MemOverlap has_internal_overlap(TensorImpl* t) { // SymInts. Thus, if I have u0 size, we should assume that this has > 1 // elements (first expression), but if I have a u0 stride, I should NOT // assume that it is not zero (second expression) +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_gt(1)) && strides[i] == 0) { +======= + if (TORCH_GUARD_OR_FALSE(sizes[i].sym_gt(1)) && strides[i] == 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return MemOverlap::Yes; } } diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index b64ac79bc9f5..93b16ed17f08 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -71,7 +71,11 @@ c10::DispatchKeySet get_view_key_set(const at::Tensor& base) { namespace at::native { +<<<<<<< HEAD inline std::vector construct_opt_sizes(const at::Tensor& sizes) { +======= +inline static std::vector construct_opt_sizes(const at::Tensor& sizes) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // torch.tensor([]) is considered to have `dim() = 1` and `size(0) = 0` // torch.nested_tensor([]) should also has `dim() = 1` and `size(0) = 0` if (sizes.dim() == 0) { @@ -182,7 +186,11 @@ NestedTensorImpl::NestedTensorImpl( "coverage, and works with torch.compile."); auto storage_device = storage_.device(); TORCH_INTERNAL_ASSERT( +<<<<<<< HEAD storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_privateuseone(), +======= + storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_hpu() || storage_device.is_privateuseone(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "NestedTensorImpl storage must be either CUDA, CPU, XPU or ", get_privateuse1_backend(), " but got ", storage_device); 
validate_nested_tensor_metadata(nested_sizes_, nested_strides_, storage_offsets_); diff --git a/aten/src/ATen/OpMathType.h b/aten/src/ATen/OpMathType.h index d00195b07e49..0cdb18eea703 100644 --- a/aten/src/ATen/OpMathType.h +++ b/aten/src/ATen/OpMathType.h @@ -41,6 +41,13 @@ struct OpMathType { using type = float; }; template <> +<<<<<<< HEAD +======= +struct OpMathType { + using type = float; +}; +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct OpMathType> { using type = c10::complex; }; diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index f9f69aa3c42b..254efbdd543c 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -29,12 +29,29 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { bool is_non_overlapping_and_dense = true) : TensorImpl(key_set, data_type, device), opaque_handle_(std::move(opaque_handle)) { +<<<<<<< HEAD set_storage_access_should_throw(); set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); sizes_and_strides_.set_sizes(sizes); refresh_numel(); // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; +======= + constructor_impl(sizes, is_non_overlapping_and_dense); + } + + OpaqueTensorImpl( + TensorImpl::ImplType impl_type, + c10::Storage&& storage, + at::DispatchKeySet key_set, + const caffe2::TypeMeta data_type, + OpaqueHandle opaque_handle, + c10::IntArrayRef sizes, + bool is_non_overlapping_and_dense = true) + : TensorImpl(impl_type, std::move(storage), key_set, data_type), + opaque_handle_(std::move(opaque_handle)) { + constructor_impl(sizes, is_non_overlapping_and_dense); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Destructor doesn't call release_resources because it's @@ -181,6 +198,20 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { return "OpaqueTensorImpl"; } +<<<<<<< HEAD +======= + void constructor_impl( + c10::IntArrayRef sizes, + bool is_non_overlapping_and_dense) { + set_storage_access_should_throw(); + set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); + sizes_and_strides_.set_sizes(sizes); + refresh_numel(); + // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) + is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OpaqueHandle opaque_handle_; }; diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp index 699c47e36725..db6867681e80 100644 --- a/aten/src/ATen/ParallelNative.cpp +++ b/aten/src/ATen/ParallelNative.cpp @@ -222,8 +222,12 @@ void set_num_threads(int nthreads) { int stored_nthreads = num_intraop_threads.load(); if (stored_nthreads <= 0) { // plus one because of master thread +<<<<<<< HEAD // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) stored_nthreads = _get_intraop_pool().size() + 1; +======= + stored_nthreads = static_cast(_get_intraop_pool().size() + 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (stored_nthreads != nthreads) { TORCH_WARN( @@ -251,8 +255,12 @@ int get_num_threads() { return intraop_default_num_threads(); } else { TORCH_INTERNAL_ASSERT(nthreads == CONSUMED); +<<<<<<< 
HEAD // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) return _get_intraop_pool().size() + 1; +======= + return static_cast(_get_intraop_pool().size() + 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #else caffe2::PThreadPool* const pool = caffe2::pthreadpool(); diff --git a/aten/src/ATen/ParallelOpenMP.cpp b/aten/src/ATen/ParallelOpenMP.cpp index 388cbb1a4b9f..2e09fba48f69 100644 --- a/aten/src/ATen/ParallelOpenMP.cpp +++ b/aten/src/ATen/ParallelOpenMP.cpp @@ -10,6 +10,7 @@ #include #endif +<<<<<<< HEAD #include namespace at { @@ -19,6 +20,15 @@ namespace native::mkldnn { void clear_computation_cache(); } // namespace native::mkldnn #endif +======= +#if AT_MKLDNN_ENABLED() +#include +#endif + +#include + +namespace at { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { // Number of threads set by the user diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index 0313849f788f..b9a3603cb663 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -26,9 +26,15 @@ bool SavedTensorDefaultHooks::is_enabled() { return !tls.disabled_error_message.has_value(); } +<<<<<<< HEAD void SavedTensorDefaultHooks::disable(const std::string& message) { tls.disabled_error_message = message; if (!tls.stack.empty()) { +======= +void SavedTensorDefaultHooks::disable(const std::string& message, const bool fail_if_non_empty) { + tls.disabled_error_message = message; + if (fail_if_non_empty && !tls.stack.empty()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assertSavedTensorHooksNotDisabled(); } } @@ -72,9 +78,15 @@ std::pair SavedTensorDefaultHooks::pop_hooks() { return hooks; } +<<<<<<< HEAD std::optional> SavedTensorDefaultHooks::get_hooks() { // For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime] if (!is_initialized || tls.stack.empty() || tls.is_tracing) { +======= +std::optional> SavedTensorDefaultHooks::get_hooks(bool ignore_is_tracing) { + // For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime] + if (!is_initialized || tls.stack.empty() || (!ignore_is_tracing && tls.is_tracing)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::nullopt; } return tls.stack.top(); diff --git a/aten/src/ATen/SavedTensorHooks.h b/aten/src/ATen/SavedTensorHooks.h index 2803bdc64668..3dfc5a535a06 100644 --- a/aten/src/ATen/SavedTensorHooks.h +++ b/aten/src/ATen/SavedTensorHooks.h @@ -36,7 +36,11 @@ struct TORCH_API SavedTensorDefaultHooks { c10::SafePyObject unpack_hook); static std::pair pop_hooks(); static std::optional> +<<<<<<< HEAD get_hooks(); +======= + get_hooks(bool ignore_is_tracing = false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static void lazy_initialize(); static const impl::SavedTensorDefaultHooksTLS& get_tls_state(); @@ -48,7 +52,13 @@ struct TORCH_API SavedTensorDefaultHooks { // disabled, then the following will raise an error: // - Attempting to push_hooks // - calling disable(message) with a non-zero stack (hooks) size +<<<<<<< HEAD static void disable(const std::string& error_message); +======= + static void 
disable( + const std::string& error_message, + const bool fail_if_non_empty = true); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static void enable(); static bool is_enabled(); static const std::optional& get_disabled_error_message(); diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp index 693fb46e639f..c5a4447b932a 100644 --- a/aten/src/ATen/ScalarOps.cpp +++ b/aten/src/ATen/ScalarOps.cpp @@ -8,7 +8,32 @@ namespace at { namespace { template inline void fill_inplace(Tensor& self, const Scalar& value_scalar) { +<<<<<<< HEAD auto value = value_scalar.to(); +======= + scalar_t value{}; + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + // relaxed float cast: allow inf similar to the torch.tensor constructor + // + // without this, we had the following divergence: + // torch.tensor(1123581321.0, dtype=torch.float16) + // => tensor(inf, dtype=torch.float16) + // torch.ops.aten.scalar_tensor.default(1123581321, dtype=torch.float16) + // => RuntimeError: value cannot be converted to type at::Half without overflow + + value = static_cast(value_scalar.to()); + } else { + value = value_scalar.to(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t* dptr = static_cast(self.data_ptr()); *dptr = value; } diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 2a3b9481255f..8a64d62040c2 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -108,7 +108,11 @@ void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, cons AT_ASSERT(device() == values_.device()); AT_ASSERT(values_.device() == indices_.device()); +<<<<<<< HEAD coalesced_ = TORCH_GUARD_SIZE_OBLIVIOUS(sym_nnz().sym_lt(2)); +======= + coalesced_ = TORCH_GUARD_OR_FALSE(sym_nnz().sym_lt(2)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index cc6fc929f533..8dd30590eb26 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -5,7 +5,11 @@ namespace at { // See TensorGeometry.h on why this is useful now that we cache is_contiguous. template +<<<<<<< HEAD bool _geometry_is_contiguous(ArrayRef sizes, ArrayRef strides) { +======= +static bool _geometry_is_contiguous(ArrayRef sizes, ArrayRef strides) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) assert(!overflows(sizes.size())); auto dim = static_cast(sizes.size()); T expected_stride = 1; diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 38fe78901ce7..a090164a8bef 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -222,8 +222,13 @@ inline Tensor applySlice( ? 
(*self_sizes)[dim] : self.sym_size(dim); if (!disable_slice_optimization && +<<<<<<< HEAD TORCH_GUARD_SIZE_OBLIVIOUS(start.sym_eq(0)) && TORCH_GUARD_SIZE_OBLIVIOUS(length.sym_eq(stop)) && step == 1) { +======= + TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) && + TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return self; } } diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 805f1f2f6c2e..7a6d5f8e30db 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -1388,7 +1388,11 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { case FastSetupType::NON_OVERLAPPING_DENSE: { // find the index of a defined tensor in operands_ start from input tensor +<<<<<<< HEAD int i_defined; // NOLINT(cppcoreguidelines-init-variables) +======= + int i_defined = -1; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (i_defined = ntensors() - 1; i_defined >= 0; --i_defined) { if (tensor(i_defined).defined()) break; } @@ -1535,7 +1539,10 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // Nothing beyond this point is important for meta functions, so it's fine to exit early here. // Extend the condition to MAIA tesnors as MAIA tensors also don't have storage. if (privateuse1_without_storage || +<<<<<<< HEAD common_device_.type() == DeviceType::MTIA || +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::IPU || common_device_.type() == DeviceType::Lazy || diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 399164688f86..0e23a5b9824d 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -327,7 +327,11 @@ std::vector defaultStrides(IntArrayRef sizes) { // see overloads of computeStride() below. // template +<<<<<<< HEAD inline std::optional computeStride_impl( +======= +inline static std::optional computeStride_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const NewShapeVec& oldshape, const NewShapeVec& oldstride, const NewShapeVec& newshape, @@ -343,7 +347,11 @@ inline std::optional computeStride_impl( // This could perhaps be combined with the below code, but the complexity // didn't seem worth it. const Numel numel = c10::multiply_integers(oldshape); +<<<<<<< HEAD bool zero_numel = TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(numel, 0)); +======= + bool zero_numel = TORCH_GUARD_OR_FALSE(sym_eq(numel, 0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (zero_numel && oldshape.equals(newshape)) { return toResult(oldstride); } @@ -367,19 +375,47 @@ inline std::optional computeStride_impl( // numel in current chunk Numel tensor_numel = 1; Numel view_numel = 1; +<<<<<<< HEAD +======= + + // The usages of TORCH_GUARD_OR_TRUE/TORCH_GUARD_OR_FALSE below could result in returning + // std::nullopt which has an effect of falling back to a clone when unbacked symints are present. + // But it will not result in returning different or wrong results. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (int64_t tensor_d = oldshape.size() - 1; tensor_d >= 0; tensor_d--) { tensor_numel *= oldshape[tensor_d]; // if end of tensor size chunk, check view if ((tensor_d == 0) || +<<<<<<< HEAD (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(oldshape[tensor_d - 1], 1)) && TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(oldstride[tensor_d - 1], tensor_numel * chunk_base_stride)))) { while (view_d >= 0 && (TORCH_GUARD_SIZE_OBLIVIOUS(sym_lt(view_numel, tensor_numel)) || TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(newshape[view_d], 1)))) { +======= + (TORCH_GUARD_OR_TRUE(sym_ne(oldshape[tensor_d - 1], 1)) && + TORCH_GUARD_OR_TRUE(sym_ne(oldstride[tensor_d - 1], tensor_numel * chunk_base_stride)))) { + // We want to accumulate stuff in view_numel until view_numel == tensor_numel; if we do not + // know if that is satisfied we keep accumulating. For example if view_numel = 1 and tensor_numel = u1, + // we want to take that path, view_numel will become u0. Next iteration if u0==u1 we want to stop. + // That's why we use TORCH_GUARD_OR_TRUE below. + + // we use TORCH_GUARD_OR_FALSE and not TORCH_GUARD_OR_TRUE when comparing newshape[view_d] ==1 because + // if we know view_numel < tensor_numel is false, we want to stop. Unless we know for sure newshape[view_d]==1 + // in that case we would stop in the next iteration anyway. For example, if view_numel = u0 and tensor_numel = u1, + // and u0==u1, then we want to stop unless newshape[view_d]==1. Taking one more iteration will keep [view_numel = u0 + // and tensor_numel = u1]. + while (view_d >= 0 && + (TORCH_GUARD_OR_TRUE(sym_lt(view_numel, tensor_numel)) || TORCH_GUARD_OR_FALSE(sym_eq(newshape[view_d], 1)))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) newstride[view_d] = view_numel * chunk_base_stride; view_numel *= newshape[view_d]; view_d--; } +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(view_numel, tensor_numel))) { +======= + if (TORCH_GUARD_OR_TRUE(sym_ne(view_numel, tensor_numel))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::nullopt; } if (tensor_d > 0) { diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 51d5f2d6412f..b01b4697c281 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -105,7 +105,11 @@ std::string get_cpu_capability() { return "DEFAULT"; case native::CPUCapability::ZVECTOR: return "Z VECTOR"; +<<<<<<< HEAD #elif defined(HAVE_SVE_CPU_DEFINITION) +======= +#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case native::CPUCapability::DEFAULT: return "DEFAULT"; case native::CPUCapability::SVE256: diff --git a/aten/src/ATen/VmapModeRegistrations.cpp b/aten/src/ATen/VmapModeRegistrations.cpp index 5e137c7396ba..538c802b19cf 100644 --- a/aten/src/ATen/VmapModeRegistrations.cpp +++ b/aten/src/ATen/VmapModeRegistrations.cpp @@ -20,12 +20,20 @@ namespace at { // We haven't made a decision on that yet so we are temporarily banning random // operations inside of vmap while we gather user feedback. +<<<<<<< HEAD template Tensor unsupportedRandomOp(Args... args) { +======= +template static Tensor unsupportedRandomOp(Args...
args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "vmap: We do not yet support calling random operations inside of vmap. ", "Please perform random operations outside of vmap as a workaround"); } +<<<<<<< HEAD template Tensor& unsupportedRandomOp_(Args... args) { +======= +template static Tensor& unsupportedRandomOp_(Args... args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "vmap: We do not yet support calling random operations inside of vmap. ", "Please perform random operations outside of vmap as a workaround"); } diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 4fae147e2815..f23cdd51342e 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -64,7 +64,11 @@ thread_local std::array at::ScalarType::Undefined, // IDEEP. at::kHalf, // AMD HIP at::ScalarType::Undefined, // FPGA +<<<<<<< HEAD at::ScalarType::Undefined, // ONNX Runtime / Microsoft +======= + at::kBFloat16, // ONNX Runtime / Microsoft +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::kBFloat16, // XLA / TPU at::ScalarType::Undefined, // Vulkan at::ScalarType::Undefined, // Metal @@ -500,6 +504,47 @@ TORCH_LIBRARY_IMPL(aten, AutocastMTIA, m) { TORCH_FN((&at::autocast::binary_cross_entropy_banned))); } +<<<<<<< HEAD +======= +// MAIA +TORCH_LIBRARY_IMPL(_, AutocastMAIA, m) { + m.fallback(torch::CppFunction::makeFallthrough()); +} + +TORCH_LIBRARY_IMPL(aten, AutocastMAIA, m) { + // lower_precision_fp +#define _KERNEL_MAIA_LOW_PRECISION_FP(...) \ + KERNEL_MAIA(__VA_ARGS__, lower_precision_fp) + + AT_FORALL_LOWER_PRECISION_FP(_KERNEL_MAIA_LOW_PRECISION_FP) + + // fp32 +#define _KERNEL_MAIA_FP32(...) KERNEL_MAIA(__VA_ARGS__, fp32) + + AT_FORALL_FP32(_KERNEL_MAIA_FP32) + + // fp32_set_opt_dtype +#define _KERNEL_MAIA_FP32_SET_OPT_DTYPE(...) \ + KERNEL_MAIA(__VA_ARGS__, fp32_set_opt_dtype) + + AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_MAIA_FP32_SET_OPT_DTYPE) + + // fp32_append_dtype + // The fp32_append_dtype wrapper overrides implicit promotion behavior. + // norm does not implicitly promote, but be aware when adding new ops to this policy. + AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE( + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA) + + // promote +#define _KERNEL_MAIA_PROMOTE(...) 
KERNEL_MAIA(__VA_ARGS__, promote) + + AT_FORALL_PROMOTE(_KERNEL_MAIA_PROMOTE) + + m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"), + TORCH_FN((&at::autocast::binary_cross_entropy_banned))); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // XPU TORCH_LIBRARY_IMPL(_, AutocastXPU, m) { m.fallback(torch::CppFunction::makeFallthrough()); diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index ec30eb66834a..6b7b575b3c19 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -123,12 +123,23 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) { _(privateuseone, at::kPrivateUse1) // deprecated other backend specific autocast APIs +<<<<<<< HEAD AT_FORALL_DEPRECATED_AUTOCAST_BACKENDS(DECLARE_DEPRECATED_AUTOCAST_APIS) const std::array _AUTOCAST_SUPPORTED_DEVICES{ at::kCPU, at::kCUDA, at::kMTIA, +======= +// NOLINTNEXTLINE(misc-use-internal-linkage) +AT_FORALL_DEPRECATED_AUTOCAST_BACKENDS(DECLARE_DEPRECATED_AUTOCAST_APIS) + +const std::array _AUTOCAST_SUPPORTED_DEVICES{ + at::kCPU, + at::kCUDA, + at::kMTIA, + at::kMAIA, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::kXPU, at::kIPU, at::kHPU, @@ -149,6 +160,11 @@ inline bool is_autocast_eligible( tensor.is_floating_point(); case c10::DeviceType::MTIA: return tensor.is_mtia() && tensor.is_floating_point(); +<<<<<<< HEAD +======= + case c10::DeviceType::MAIA: + return tensor.is_maia() && tensor.is_floating_point(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case c10::DeviceType::XPU: return tensor.is_xpu() && tensor.is_floating_point(); case c10::DeviceType::IPU: @@ -176,6 +192,11 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type( return DispatchKey::AutocastCPU; case c10::DeviceType::MTIA: return DispatchKey::AutocastMTIA; +<<<<<<< HEAD +======= + case c10::DeviceType::MAIA: + return DispatchKey::AutocastMAIA; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case c10::DeviceType::XPU: return DispatchKey::AutocastXPU; case c10::DeviceType::IPU: @@ -189,7 +210,12 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type( case c10::DeviceType::MPS: return DispatchKey::AutocastMPS; default: +<<<<<<< HEAD throw std::runtime_error( +======= + TORCH_CHECK( + false, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "unknown device type for autocast in get_autocast_dispatch_key_from_device_type"); } } @@ -210,7 +236,12 @@ inline at::ScalarType get_lower_precision_fp_from_device_type( if (is_autocast_available(device_type)) { return get_autocast_dtype(device_type); } else { +<<<<<<< HEAD throw std::runtime_error( +======= + TORCH_CHECK( + false, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "unknown device type for autocast in get_lower_precision_fp_from_device_type"); } } @@ -747,6 +778,27 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REDISPATCH_SIGNATURE, \ POLICY) +<<<<<<< HEAD +======= +// KERNEL_MAIA/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastMAIA +#define KERNEL_MAIA(...) 
KERNEL(c10::DeviceType::MAIA, __VA_ARGS__) + +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA( \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) \ + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \ + c10::DeviceType::MAIA, \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // KERNEL_XPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU // registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastXPU #define KERNEL_XPU(...) KERNEL(c10::DeviceType::XPU, __VA_ARGS__) diff --git a/aten/src/ATen/core/ATen_pch.h b/aten/src/ATen/core/ATen_pch.h index f10c191a4c1f..cd9b48cc0549 100644 --- a/aten/src/ATen/core/ATen_pch.h +++ b/aten/src/ATen/core/ATen_pch.h @@ -3,11 +3,14 @@ #pragma push_macro("TORCH_ASSERT_NO_OPERATORS") #define TORCH_ASSERT_NO_OPERATORS +<<<<<<< HEAD // This macro doesn't work if defined after the first time inttypes.h // is included, so won't work anywhere if not defined here. #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include // This list of headers was generated using a script that finds diff --git a/aten/src/ATen/core/CachingHostAllocator.cpp b/aten/src/ATen/core/CachingHostAllocator.cpp new file mode 100644 index 000000000000..5939253caf55 --- /dev/null +++ b/aten/src/ATen/core/CachingHostAllocator.cpp @@ -0,0 +1,33 @@ +#include + +#include + +namespace at { + +namespace { + +static std::array + allocator_array{}; +static std::array + allocator_priority{}; + +} // anonymous namespace + +void setHostAllocator( + at::DeviceType device_type, + at::HostAllocator* allocator, + uint8_t priority) { + if (priority >= allocator_priority[static_cast(device_type)]) { + allocator_array[static_cast(device_type)] = allocator; + allocator_priority[static_cast(device_type)] = priority; + } +} + +at::HostAllocator* getHostAllocator(at::DeviceType device_type) { + auto* allocator = allocator_array[static_cast(device_type)]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocator, "Host Allocator for ", device_type, " is not set."); + return allocator; +} + +} // namespace at diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 76981dff46b8..414cbfbb551e 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -1,4 +1,11 @@ +<<<<<<< HEAD #include +======= +#pragma once + +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -46,7 +53,11 @@ namespace { } // Struct containing memory allocator summary statistics for host. +<<<<<<< HEAD struct HostStats { +======= +struct TORCH_API HostStats { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // COUNT: allocations requested by client code. 
Note that active // count can be extracted by looking at current allocations Stat allocation; @@ -174,7 +185,16 @@ template < typename E, typename B = HostBlock> struct CachingHostAllocatorImpl { +<<<<<<< HEAD virtual ~CachingHostAllocatorImpl() = default; +======= + virtual ~CachingHostAllocatorImpl() { + active_ = false; + if (pinned_use_background_threads()) { + getBackgroundThreadPool()->waitWorkComplete(); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: // return data_ptr and block pair. @@ -211,7 +231,11 @@ struct CachingHostAllocatorImpl { // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { getBackgroundThreadPool()->run([&]() { +<<<<<<< HEAD while (true) { +======= + while (active_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) process_events(); std::this_thread::sleep_for(std::chrono::microseconds(100)); } @@ -274,7 +298,12 @@ struct CachingHostAllocatorImpl { } } +<<<<<<< HEAD virtual bool record_event(void* ptr, void* ctx, S stream) { +======= + virtual bool record_event(void* ptr, void* ctx, c10::Stream s) { + S stream = S(s); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto* block = reinterpret_cast(ctx); // Note: we need to check if the passed-in `ctx` is valid. This is because @@ -467,7 +496,11 @@ struct CachingHostAllocatorImpl { virtual B* get_free_block(size_t size) { auto index = size_index(size); std::lock_guard g(free_list_[index].mutex_); +<<<<<<< HEAD if (free_list_[index].list_.size() > 0) { +======= + if (!free_list_[index].list_.empty()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) B* block = free_list_[index].list_.back(); free_list_[index].list_.pop_back(); block->allocated_ = true; @@ -616,28 +649,78 @@ struct CachingHostAllocatorImpl { alignas(64) std::mutex events_mutex_; std::deque> events_; // event queue paired with block +<<<<<<< HEAD +======= + + // Indicates whether the object is active. + // Set to false in the destructor to signal background threads to stop. + std::atomic active_{true}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) protected: alignas(64) HostStatsStaged stats_; }; +<<<<<<< HEAD template struct CachingHostAllocatorInterface : public at::Allocator { CachingHostAllocatorInterface() : impl_(std::make_unique()) {} at::DataPtr allocate(size_t size) override { TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate"); +======= +struct TORCH_API HostAllocator : public at::Allocator { + // Associates the pinned memory allocation with a stream to track + // dependencies. 
This ensures the memory won't be reused until the stream's + // operations complete + virtual bool record_event(void* ptr, void* ctx, c10::Stream stream) = 0; + + // Frees all cached pinned memory and returns it to the system, clearing the + // allocator's internal cache + virtual void empty_cache() = 0; + + // Returns comprehensive statistics about the allocator's memory usage, + // allocation patterns, and timing metrics + virtual HostStats get_stats() = 0; + + // Resets the cumulative allocation statistics + virtual void reset_accumulated_stats() = 0; + + // Resets the peak memory usage metrics + virtual void reset_peak_stats() = 0; +}; + +template +struct CachingHostAllocatorInterface : public HostAllocator { + CachingHostAllocatorInterface() : impl_(std::make_unique()) {} + + at::DataPtr allocate(size_t size) override { + auto ptr_and_ctx = impl_->allocate(size); + return { + ptr_and_ctx.first, + ptr_and_ctx.second, + deleteFunc, // Use the template parameter deleter function + at::DeviceType::CPU}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void free(void* ctx) { impl_->free(ctx); } +<<<<<<< HEAD template bool record_event(void* ptr, void* ctx, S stream) { return impl_->record_event(ptr, ctx, stream); } void empty_cache() { +======= + bool record_event(void* ptr, void* ctx, c10::Stream stream) override { + return impl_->record_event(ptr, ctx, stream); + } + + void empty_cache() override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) impl_->empty_cache(); } @@ -646,6 +729,7 @@ struct CachingHostAllocatorInterface : public at::Allocator { impl_->copy_data(dest, src, count); } +<<<<<<< HEAD HostStats getStats() { return impl_->getStats(); } @@ -655,11 +739,59 @@ struct CachingHostAllocatorInterface : public at::Allocator { } void resetPeakStats() { +======= + HostStats get_stats() override { + return impl_->getStats(); + } + + void reset_accumulated_stats() override { + impl_->resetAccumulatedStats(); + } + + void reset_peak_stats() override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) impl_->resetPeakStats(); } std::unique_ptr impl_; }; +<<<<<<< HEAD +======= +#define DECLARE_HOST_ALLOCATOR(name, impl, deleter, instance) \ + void deleter(void* ptr); \ + struct name final \ + : public at::CachingHostAllocatorInterface {}; \ + static name instance; \ + void deleter(void* ptr) { \ + instance.free(ptr); \ + } + +/** + * Set the host allocator for DeviceType `device_type`. This allocator manages + * pinned memory on the host that can be accessed efficiently by the specified + * device type. Note that this function is not thread-safe. 
+ */ +TORCH_API void setHostAllocator( + at::DeviceType device_type, + at::HostAllocator* allocator, + uint8_t priority = 0); + +TORCH_API at::HostAllocator* getHostAllocator(at::DeviceType device_type); + +template +struct HostAllocatorRegistry { + explicit HostAllocatorRegistry(HostAllocator* allocator) { + at::setHostAllocator(device_type, allocator); + } +}; + +#define REGISTER_HOST_ALLOCATOR(device_type, allocator) \ + namespace { \ + static at::HostAllocatorRegistry \ + g_host_allocator_registry_instance(allocator); \ + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at C10_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h index d187d7b7c116..f288c8fa5da5 100644 --- a/aten/src/ATen/core/Dict.h +++ b/aten/src/ATen/core/Dict.h @@ -116,10 +116,14 @@ class DictIterator final { DictIterator(const DictIterator& rhs): entryRef_(rhs.entryRef_) {} DictIterator(DictIterator&& rhs) noexcept: entryRef_(std::move(rhs.entryRef_)) {} +<<<<<<< HEAD DictIterator& operator=(const DictIterator& rhs) { entryRef_ = rhs.entryRef_; return *this; } +======= + DictIterator& operator=(const DictIterator& rhs) = default; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DictIterator& operator=(DictIterator&& rhs) noexcept { entryRef_ = std::move(rhs.entryRef_); return *this; diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h index 6261af5fb66a..3e8e555557eb 100644 --- a/aten/src/ATen/core/Dict_inl.h +++ b/aten/src/ATen/core/Dict_inl.h @@ -53,8 +53,12 @@ inline size_t DictKeyHash::operator()(const IValue& ivalue) const { } else if (ivalue.isDevice()) { return std::hash()(ivalue.toDevice()); } else { +<<<<<<< HEAD throw std::runtime_error( "Can't hash IValues with tag '" + ivalue.tagKind() + "'"); +======= + TORCH_CHECK(false, "Can't hash IValues with tag '", ivalue.tagKind(), "'"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/core/Dimname.cpp b/aten/src/ATen/core/Dimname.cpp index 88e368c605cf..f645224fd8a4 100644 --- a/aten/src/ATen/core/Dimname.cpp +++ b/aten/src/ATen/core/Dimname.cpp @@ -25,9 +25,16 @@ bool Dimname::isValidName(const std::string& name) { } for (auto it = name.begin(); it != name.end(); ++it) { // NOLINTNEXTLINE(bugprone-branch-clone) +<<<<<<< HEAD if (std::isalpha(*it) || *it == '_') { continue; } else if (it != name.begin() && std::isdigit(*it)) { +======= + const unsigned char ch = static_cast(*it); + if (std::isalpha(ch) || ch == '_') { + continue; + } else if (it != name.begin() && std::isdigit(ch)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) continue; } return false; diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index 7762e543234a..9b53525535b1 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -1,5 +1,6 @@ #include #include +<<<<<<< HEAD #include #include @@ -13,6 +14,25 @@ std::ostream& operator<<(std::ostream & out, Backend b) { } std::ostream& operator<<(std::ostream & out, const Scalar& s) { +======= +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace c10 { +std::ostream& operator<<(std::ostream& out, Backend b) { + return out << 
toString(b); +} + +std::ostream& operator<<(std::ostream& out, const Scalar& s) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (s.isFloatingPoint()) { return out << s.toDouble(); } @@ -35,6 +55,7 @@ std::ostream& operator<<(std::ostream & out, const Scalar& s) { } std::string toString(const Scalar& s) { +<<<<<<< HEAD std::stringstream out; out << s; return std::move(out).str(); @@ -75,17 +96,56 @@ static std::tuple __printFormat(std::ostream& stream, const Tensor& if(size == 0) { return std::make_tuple(1., 0); } +======= + return fmt::format("{}", fmt::streamed(s)); +} +} // namespace c10 + +namespace at { + +std::ostream& operator<<(std::ostream& out, const DeprecatedTypeProperties& t) { + return out << t.toString(); +} + +enum class FormatType { + Default, // 'g' format (defaultfloat equivalent) + Scientific, // 'e' format with precision 4 + Fixed // 'f' format with precision 4 +}; + +struct PrintFormat { + double scale; + int width; + FormatType type; + + PrintFormat(double s, int w, FormatType t = FormatType::Default) + : scale(s), width(w), type(t) {} +}; + +static PrintFormat __printFormat(const Tensor& self) { + auto size = self.numel(); + if (size == 0) { + return PrintFormat(1., 0); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool intMode = true; auto self_p = self.const_data_ptr(); for (const auto i : c10::irange(size)) { auto z = self_p[i]; +<<<<<<< HEAD if(std::isfinite(z)) { if(z != std::ceil(z)) { +======= + if (std::isfinite(z)) { + if (z != std::ceil(z)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) intMode = false; break; } } } +<<<<<<< HEAD int64_t offset = 0; while(!std::isfinite(self_p[offset])) { offset = offset + 1; @@ -110,16 +170,42 @@ static std::tuple __printFormat(std::ostream& stream, const Tensor& } } if(expMin != 0) { +======= + + int64_t offset = 0; + while (offset < size && !std::isfinite(self_p[offset])) { + offset = offset + 1; + } + + double expMin = 1; + double expMax = 1; + if (offset != size) { + expMin = std::fabs(self_p[offset]); + expMax = std::fabs(self_p[offset]); + for (const auto i : c10::irange(offset, size)) { + double z = std::fabs(self_p[i]); + if (std::isfinite(z)) { + expMin = std::min(expMin, z); + expMax = std::max(expMax, z); + } + } + if (expMin != 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) expMin = std::floor(std::log10(expMin)) + 1; } else { expMin = 1; } +<<<<<<< HEAD if(expMax != 0) { +======= + if (expMax != 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) expMax = std::floor(std::log10(expMax)) + 1; } else { expMax = 1; } } +<<<<<<< HEAD double scale = 1; int sz = 11; if(intMode) { @@ -144,10 +230,39 @@ static std::tuple __printFormat(std::ostream& stream, const Tensor& stream << std::fixed << std::setprecision(4); } else { if(expMax == 0) { +======= + + double scale = 1; + int sz = 11; + + if (intMode) { + if (expMax > 9) { + sz = 11; + return PrintFormat(scale, sz, FormatType::Scientific); + } else { + sz = static_cast(expMax) + 1; + return PrintFormat(scale, sz, FormatType::Default); + } + } else { + if (expMax - expMin > 4) { + sz = 11; + if (std::fabs(expMax) > 99 || std::fabs(expMin) > 99) { + sz = sz + 
1; + } + return PrintFormat(scale, sz, FormatType::Scientific); + } else { + if (expMax > 5 || expMax < 0) { + sz = 7; + scale = std::pow(10, expMax - 1); + return PrintFormat(scale, sz, FormatType::Fixed); + } else { + if (expMax == 0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sz = 7; } else { sz = static_cast(expMax) + 6; } +<<<<<<< HEAD stream << std::fixed << std::setprecision(4); } } @@ -176,10 +291,54 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line int64_t lastColumn = -1; while(firstColumn < self.size(1)) { if(firstColumn + nColumnPerLine <= self.size(1)) { +======= + return PrintFormat(scale, sz, FormatType::Fixed); + } + } + } +} + +// Precompiled format specs +static constexpr auto FMT_G = FMT_COMPILE("{:>{}g}"); +static constexpr auto FMT_E4 = FMT_COMPILE("{:>{}.4e}"); +static constexpr auto FMT_F4 = FMT_COMPILE("{:>{}.4f}"); + +// Print a single value directly into the stream buffer with no temporaries +static void printValue(std::ostream& stream, double v, const PrintFormat& pf) { + auto out_it = std::ostreambuf_iterator(stream); + double val = v / pf.scale; + switch (pf.type) { + case FormatType::Default: + fmt::format_to(out_it, FMT_G, val, pf.width); + break; + case FormatType::Scientific: + fmt::format_to(out_it, FMT_E4, val, pf.width); + break; + case FormatType::Fixed: + fmt::format_to(out_it, FMT_F4, val, pf.width); + break; + } +} + +static void __printMatrix( + std::ostream& stream, + const Tensor& self, + int64_t linesize, + int64_t indent) { + auto printFmt = __printFormat(self); + + int64_t nColumnPerLine = (linesize - indent) / (printFmt.width + 1); + int64_t firstColumn = 0; + int64_t lastColumn = -1; + + while (firstColumn < self.size(1)) { + if (firstColumn + nColumnPerLine <= self.size(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) lastColumn = firstColumn + nColumnPerLine - 1; } else { lastColumn = self.size(1) - 1; } +<<<<<<< HEAD if(nColumnPerLine < self.size(1)) { if(firstColumn != 0) { stream << '\n'; @@ -208,6 +367,45 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line } } else { stream << " "; +======= + + if (nColumnPerLine < self.size(1)) { + if (firstColumn != 0) { + stream.put('\n'); + } + fmt::print( + stream, + "Columns {} to {}{:>{}s}", + firstColumn + 1, + lastColumn + 1, + "", // empty string to pad + indent // width to pad to + ); + } + + if (printFmt.scale != 1) { + fmt::print(stream, "{} *\n{:>{}s}", printFmt.scale, "", indent); + } + + for (const auto l : c10::irange(self.size(0))) { + Tensor row = self.select(0, l); + const double* row_ptr = row.const_data_ptr(); + + for (const auto c : c10::irange(firstColumn, lastColumn + 1)) { + printValue(stream, row_ptr[c], printFmt); + + if (c == lastColumn) { + stream.put('\n'); + if (l != self.size(0) - 1) { + if (printFmt.scale != 1) { + fmt::print(stream, "{:>{}s} ", "", indent); + } else { + fmt::print(stream, "{:>{}s}", "", indent); + } + } + } else { + stream.put(' '); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } } @@ -215,6 +413,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line } } +<<<<<<< HEAD static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) { std::vector counter(self.ndimension()-2); @@ -229,6 +428,23 
@@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) counter[i] = counter[i] + 1; if(counter[i] >= self.size(i)) { if(i == self.ndimension()-3) { +======= +static void __printTensor( + std::ostream& stream, + Tensor& self, + int64_t linesize) { + std::vector counter(self.ndimension() - 2, 0); + counter[0] = -1; + + bool start = true; + bool finished = false; + + while (true) { + for (int64_t i = 0; self.ndimension() - 2; i++) { + counter[i] = counter[i] + 1; + if (counter[i] >= self.size(i)) { + if (i == self.ndimension() - 3) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) finished = true; break; } @@ -237,6 +453,7 @@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) break; } } +<<<<<<< HEAD if(finished) { break; } @@ -252,10 +469,29 @@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) stream << counter[i]+1 << ","; } stream << ".,.) = " << '\n'; +======= + if (finished) { + break; + } + if (start) { + start = false; + } else { + stream.put('\n'); + } + + stream.put('('); + Tensor tensor = self; + for (const auto i : c10::irange(self.ndimension() - 2)) { + tensor = tensor.select(0, counter[i]); + fmt::print(stream, "{},", counter[i] + 1); + } + fmt::print(stream, ".,.) = \n"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __printMatrix(stream, tensor, linesize, 1); } } +<<<<<<< HEAD void print(const Tensor & t, int64_t linesize) { print(std::cout,t,linesize); } @@ -342,3 +578,115 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi } } +======= +void print(const Tensor& t, int64_t linesize) { + print(std::cout, t, linesize); +} + +std::ostream& print( + std::ostream& stream, + const Tensor& tensor_, + int64_t linesize) { + if (!tensor_.defined()) { + fmt::print(stream, "[ Tensor (undefined) ]"); + return stream; + } + + if (tensor_.is_sparse()) { + fmt::print(stream, "[ {}{{}}\nindices:\n", tensor_.toString()); + print(stream, tensor_._indices(), linesize); + fmt::print(stream, "\nvalues:\n"); + print(stream, tensor_._values(), linesize); + fmt::print(stream, "\nsize:\n{}\n]", fmt::streamed(tensor_.sizes())); + return stream; + } + + Tensor tensor; + + if (tensor_.is_quantized()) { + tensor = tensor_.dequantize().to(kCPU, kDouble).contiguous(); + } else if (tensor_.is_mkldnn()) { + fmt::print(stream, "MKLDNN Tensor: "); + tensor = tensor_.to_dense().to(kCPU, kDouble).contiguous(); + } else if (tensor_.is_mps()) { + // MPS does not support double tensors, so first copy then convert + tensor = tensor_.to(kCPU).to(kDouble).contiguous(); + } else { + tensor = tensor_.to(kCPU, kDouble).contiguous(); + } + + if (tensor.ndimension() == 0) { + fmt::print( + stream, + "{}\n[ {}{{}}", + tensor.const_data_ptr()[0], + tensor_.toString()); + } else if (tensor.ndimension() == 1) { + if (tensor.numel() > 0) { + auto printFmt = __printFormat(tensor); + if (printFmt.scale != 1) { + fmt::print(stream, "{} *\n", printFmt.scale); + } + const double* tensor_p = tensor.const_data_ptr(); + for (const auto i : c10::irange(tensor.size(0))) { + printValue(stream, tensor_p[i], printFmt); + stream.put('\n'); + } + } + fmt::print(stream, "[ {}{{{}}}", tensor_.toString(), tensor.size(0)); + } else if (tensor.ndimension() == 2) { + if (tensor.numel() > 0) { + __printMatrix(stream, tensor, linesize, 0); + } + fmt::print( + stream, + "[ 
{}{{{},{}}}", + tensor_.toString(), + tensor.size(0), + tensor.size(1)); + } else { + if (tensor.numel() > 0) { + __printTensor(stream, tensor, linesize); + } + fmt::print(stream, "[ {}{{{}", tensor_.toString(), tensor.size(0)); + for (const auto i : c10::irange(1, tensor.ndimension())) { + fmt::print(stream, ",{}", tensor.size(i)); + } + fmt::print(stream, "}}"); + } + + // Add quantization info + if (tensor_.is_quantized()) { + fmt::print(stream, ", qscheme: {}", toString(tensor_.qscheme())); + if (tensor_.qscheme() == c10::kPerTensorAffine) { + fmt::print( + stream, + ", scale: {}, zero_point: {}", + tensor_.q_scale(), + tensor_.q_zero_point()); + } else if ( + tensor_.qscheme() == c10::kPerChannelAffine || + tensor_.qscheme() == c10::kPerChannelAffineFloatQParams) { + fmt::print(stream, ", scales: "); + print(stream, tensor_.q_per_channel_scales(), linesize); + fmt::print(stream, ", zero_points: "); + print(stream, tensor_.q_per_channel_zero_points(), linesize); + fmt::print(stream, ", axis: {}", tensor_.q_per_channel_axis()); + } + } + + // Proxy check for if autograd was built + if (tensor.getIntrusivePtr()->autograd_meta()) { + auto& fw_grad = tensor._fw_grad(/* level */ 0); + if (fw_grad.defined()) { + fmt::print(stream, ", tangent:\n"); + print(stream, fw_grad, linesize); + } + } + + fmt::print(stream, " ]"); + return stream; +} + +} // namespace at +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/core/IListRef_inl.h b/aten/src/ATen/core/IListRef_inl.h index a21bd22cf16c..01220d3c6b2b 100644 --- a/aten/src/ATen/core/IListRef_inl.h +++ b/aten/src/ATen/core/IListRef_inl.h @@ -168,7 +168,13 @@ class IListRefTagImpl */ static IListRefConstRef iterator_get( const typename list_type::const_iterator& it) { +<<<<<<< HEAD const auto& ivalue = (*it).get(); +======= + C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdangling-reference") + const auto& ivalue = (*it).get(); + C10_DIAGNOSTIC_POP() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (!ivalue.isNone()) { const auto& tensor = ivalue.toTensor(); return (tensor.defined()) ? 
tensor : at::OptionalTensorRef{}; diff --git a/aten/src/ATen/core/IListRef_test.cpp b/aten/src/ATen/core/IListRef_test.cpp index 505a80216d67..502c669ee59e 100644 --- a/aten/src/ATen/core/IListRef_test.cpp +++ b/aten/src/ATen/core/IListRef_test.cpp @@ -42,7 +42,11 @@ static std::vector get_unboxed_opt_tensor_vector() { } template +<<<<<<< HEAD void check_elements_same(at::ITensorListRef list, const T& thing, int use_count) { +======= +static void check_elements_same(at::ITensorListRef list, const T& thing, int use_count) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXPECT_EQ(thing.size(), list.size()); size_t i = 0; for (const auto& t : list) { diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 71029598aab2..41bf5d4d46be 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -2,6 +2,10 @@ #include using namespace c10; +<<<<<<< HEAD +======= +using std::string; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTBEGIN(performance-move-const-arg, bugprone-use-after-move, *analyzer*Move) TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) { diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index b1126e212265..547d951adff3 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -5,7 +5,11 @@ namespace at { +<<<<<<< HEAD thread_local bool NamesMode_enabled = true; +======= +thread_local static bool NamesMode_enabled = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool NamesMode::is_enabled() { return NamesMode_enabled; diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 43474515db0f..bd23edf7cbc2 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -51,9 +51,14 @@ TensorBase TensorBase::to( } void TensorBase::enforce_invariants() { +<<<<<<< HEAD if (impl_.get() == nullptr) { throw std::runtime_error("TensorImpl with nullptr is not supported"); } +======= + TORCH_CHECK( + impl_.get() != nullptr, "TensorImpl with nullptr is not supported"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Following line throws if the method is not a POD data type or is not // supported by ATen scalar_type(); diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 8d300debebe3..9722cbb06526 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -57,16 +57,27 @@ inline bool variable_excluded_from_dispatch() { // NOTE: [Tensor vs. TensorBase] // // Tensor, being the central data structure in PyTorch, gets used and +<<<<<<< HEAD // it's header included almost everywhere. Unfortunately this means // every time an operator signature is updated or changed in // native_functions.yaml, you (and every other PyTorch developer) need // to recompile all of ATen and it's dependencies. +======= +// its header included almost everywhere. Unfortunately this means +// every time an operator signature is updated or changed in +// native_functions.yaml, you (and every other PyTorch developer) need +// to recompile all of ATen and its dependencies. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // TensorBase aims to break up these header dependencies, and improve // incremental build times for all PyTorch developers. TensorBase // represents a reference counted handle to TensorImpl, exactly the // same as Tensor. However, TensorBase doesn't have code generated +<<<<<<< HEAD // methods in it's API and thus no dependence on native_functions.yaml. +======= +// methods in its API and thus no dependence on native_functions.yaml. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // Usage tips // ---------- @@ -75,9 +86,15 @@ inline bool variable_excluded_from_dispatch() { // native_functions.yaml (direct or indirect). // - Tensor inherits from TensorBase, so functions taking // `const TensorBase &` are callable with Tensor as well. +<<<<<<< HEAD // - TensorBase can be converted to tensor with `Tensor(tensor_base)`, // but this requires a reference-count bump. OptionalTensorRef on // the other hand can materialize a `const Tensor &` without +======= +// - TensorBase can be converted to Tensor with `Tensor(tensor_base)`, +// but this requires a reference-count bump. OptionalTensorRef, on +// the other hand, can materialize a `const Tensor &` without +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // touching the reference-count. class TORCH_API TensorBase { public: diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp index 390d9189190e..a8fd023320e5 100644 --- a/aten/src/ATen/core/VariableFallbackKernel.cpp +++ b/aten/src/ATen/core/VariableFallbackKernel.cpp @@ -80,6 +80,13 @@ TORCH_LIBRARY_IMPL(_, AutogradMTIA, m) { m.fallback(AUTOGRAD_FALLBACK); } +<<<<<<< HEAD +======= +TORCH_LIBRARY_IMPL(_, AutogradMAIA, m) { + m.fallback(AUTOGRAD_FALLBACK); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_LIBRARY_IMPL(_, AutogradXLA, m) { m.fallback(AUTOGRAD_FALLBACK); } diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index 13b8eda63859..01e3720b2e44 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -1,4 +1,8 @@ #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -41,9 +45,15 @@ bool torchVitalEnabled() { // If this is a performance hit, make `enabled` variable static // and return `const bool&` instead bool enabled = []() { +<<<<<<< HEAD auto e = getenv("TORCH_VITAL"); if (e != nullptr) { return e[0] != '\0'; +======= + auto const e = c10::utils::get_env("TORCH_VITAL"); + if (e.has_value()) { + return !e.value().empty(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return false; }(); diff --git a/aten/src/ATen/core/adaption.cpp b/aten/src/ATen/core/adaption.cpp index ef06b9606ba7..a16f2099ad66 100644 --- a/aten/src/ATen/core/adaption.cpp +++ b/aten/src/ATen/core/adaption.cpp @@ -5,9 +5,14 @@ namespace c10::impl { void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) { TORCH_CHECK(false, +<<<<<<< HEAD "Expected all 
tensors to be on the same device, but " "found at least two devices, ", common_device, " and ", tensor.device(), "! " "(when checking argument for argument ", argName, " in method ", methodName, ")"); +======= + "Expected all tensors to be on the same device, but got ", argName, " is on ", tensor.device(), + ", different from other tensors on ", common_device, " (when checking argument in method ", methodName, ")"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace c10::impl diff --git a/aten/src/ATen/core/alias_info.h b/aten/src/ATen/core/alias_info.h index a8a55bb782c4..6b029cee77d0 100644 --- a/aten/src/ATen/core/alias_info.h +++ b/aten/src/ATen/core/alias_info.h @@ -1,4 +1,9 @@ #pragma once +<<<<<<< HEAD +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -18,6 +23,18 @@ namespace c10 { */ class AliasInfo { public: +<<<<<<< HEAD +======= + AliasInfo() = default; + AliasInfo(bool is_write, const std::set& before_qual_strings, const std::set& after_qual_strings) : isWrite_(is_write) { + for (const auto& s: before_qual_strings) { + beforeSets_.insert(Symbol::fromQualString(s)); + } + for (const auto& s : after_qual_strings) { + afterSets_.insert(Symbol::fromQualString(s)); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Symbol for the set that can alias anything static Symbol wildcardSet() { static const Symbol wc = Symbol::fromQualString("alias::*"); diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index b13f827b8f17..8ec24e1c473e 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -28,7 +28,11 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D "Autograd dispatch key for the backend.\n", "If you only want to run inference instead of training, in C++, add `c10::InferenceMode mode;` " "before model.forward(); in Python, use `torch.inference_mode()` as a context manager (see " +<<<<<<< HEAD "https://pytorch.org/docs/stable/generated/torch.inference_mode.html).", +======= + "https://pytorch.org/docs/stable/generated/torch.autograd.grad_mode.inference_mode.html).", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n"); } diff --git a/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp index b25c4e543a40..f804aac737a7 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp @@ -519,7 +519,11 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithDictInput EXPECT_EQ(2, captured_dict_size); } +<<<<<<< HEAD string kernelWithDictInputWithOutput(Dict input1) { +======= +std::string kernelWithDictInputWithOutput(Dict input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input1.at("key2"); } @@ -581,7 +585,11 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithUnordered EXPECT_EQ(2, captured_dict_size); } +<<<<<<< HEAD 
string kernelWithUnorderedMapInputWithOutput(std::unordered_map input1) { +======= +std::string kernelWithUnorderedMapInputWithOutput(std::unordered_map input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input1.at("key2"); } diff --git a/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp b/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp index 4ebb18d82f96..a1ecea263ab1 100644 --- a/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp +++ b/aten/src/ATen/core/boxing/impl/kernel_function_test.cpp @@ -468,7 +468,11 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithDictInput_witho EXPECT_EQ(2, captured_dict_size); } +<<<<<<< HEAD string kernelWithDictInputWithOutput(Dict input1) { +======= +std::string kernelWithDictInputWithOutput(Dict input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input1.at("key2"); } diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp index 42c06c38ac34..866bbda2c92b 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp @@ -463,7 +463,11 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withou } struct KernelWithDictInputWithOutput final : OperatorKernel { +<<<<<<< HEAD string operator()(Dict input1) { +======= +std::string operator()(Dict input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input1.at("key2"); } }; @@ -475,7 +479,11 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withOu auto op = c10::Dispatcher::singleton().findSchema({"_test::dict_input", ""}); ASSERT_TRUE(op.has_value()); +<<<<<<< HEAD Dict dict; +======= + Dict dict; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dict.insert("key1", "value1"); dict.insert("key2", "value2"); auto outputs = callOp(*op, dict); @@ -484,7 +492,11 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withOu } struct KernelWithDictOutput final : OperatorKernel { +<<<<<<< HEAD Dict operator()(Dict input) { +======= + Dict operator()(Dict input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return input; } }; @@ -496,12 +508,20 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictOutput_whenR auto op = c10::Dispatcher::singleton().findSchema({"_test::dict_output", ""}); ASSERT_TRUE(op.has_value()); +<<<<<<< HEAD Dict dict; +======= + Dict dict; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dict.insert("key1", "value1"); dict.insert("key2", "value2"); auto outputs = callOp(*op, dict); EXPECT_EQ(1, outputs.size()); +<<<<<<< HEAD auto output = c10::impl::toTypedDict(outputs[0].toGenericDict()); +======= + auto output = c10::impl::toTypedDict(outputs[0].toGenericDict()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) EXPECT_EQ(2, output.size()); EXPECT_EQ("value1", output.at("key1")); @@ -520,7 
+540,11 @@ class KernelWithCache final : public OperatorKernel { }; struct KernelWithTupleInput final : OperatorKernel { +<<<<<<< HEAD string operator()(std::tuple input1) { +======= + std::string operator()(std::tuple input1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::get<0>(input1); } }; diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 27438b926db5..5ae5d88e0eed 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -152,8 +152,16 @@ struct TORCH_API DispatchKeyExtractor final { // no safe toTensorRef method, alas) ks = ks | ivalue.unsafeToTensorImpl()->key_set(); } else if (C10_UNLIKELY(ivalue.isTensorList())) { +<<<<<<< HEAD for (const at::Tensor& tensor : ivalue.toTensorList()) { ks = ks | tensor.key_set(); +======= + // NB: use toListRef as it doesn't induce refcount bumps + // (toTensorListRef is not a thing) + for (const auto& nv : ivalue.toListRef()) { + auto* tensor = nv.unsafeToTensorImpl(); + ks = ks | tensor->key_set(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // Tensor?[] translates to a c10::List so we need to peek inside @@ -200,6 +208,34 @@ struct TORCH_API DispatchKeyExtractor final { void checkInvariants(const FunctionSchema& schema) const; private: +<<<<<<< HEAD +======= + static bool isDispatchType(const Type& type) { + // Checking isSubtypeOf on a DynamicType heap-allocates a + // DynamicType version of the argument if it's not a DynamicType + // already, and this has measurable overhead during startup. 
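  // Illustrative note (a hedged sketch, not code from this patch): without
  // the cache below, each mobile call would effectively re-do
  //   type.isSubtypeOf(*DynamicType::create(*ListType::ofTensors())) || ...
  // i.e. rebuild DynamicType versions of the list/optional argument types on
  // every invocation, whereas the function-local `static const CachedTypes ct`
  // below constructs them exactly once (function-local static initialization
  // is thread-safe since C++11).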
+#ifdef C10_MOBILE + struct CachedTypes { + DynamicTypePtr listOfTensors; + DynamicTypePtr listOfOptionalTensors; + DynamicTypePtr optionalOfTensor; + }; + static const CachedTypes ct = { + DynamicType::create(*ListType::ofTensors()), + DynamicType::create(*ListType::ofOptionalTensors()), + DynamicType::create(*OptionalType::ofTensor())}; + return type.isSubtypeOf(c10::TypeFactory::get()) || + type.isSubtypeOf(ct.listOfTensors) || + type.isSubtypeOf(ct.listOfOptionalTensors) || + type.isSubtypeOf(ct.optionalOfTensor); +#else // C10_MOBILE + return type.isSubtypeOf(*TensorType::get()) || + type.isSubtypeOf(*ListType::ofTensors()) || + type.isSubtypeOf(*ListType::ofOptionalTensors()) || + type.isSubtypeOf(*OptionalType::ofTensor()); +#endif // C10_MOBILE + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static c10::utils::bitset makeBitsetForDispatchArgs( const FunctionSchema& schema) { TORCH_CHECK( @@ -210,6 +246,7 @@ struct TORCH_API DispatchKeyExtractor final { c10::utils::bitset::NUM_BITS()); c10::utils::bitset dispatch_arg_indices_reverse; for (const auto index : c10::irange(schema.arguments().size())) { +<<<<<<< HEAD if (schema.arguments()[index].type()->isSubtypeOf(*TensorType::get()) || schema.arguments()[index].type()->isSubtypeOf( *ListType::ofTensors()) || @@ -217,6 +254,9 @@ struct TORCH_API DispatchKeyExtractor final { *ListType::ofOptionalTensors()) || schema.arguments()[index].type()->isSubtypeOf( *OptionalType::ofTensor())) { +======= + if (isDispatchType(*schema.arguments()[index].type())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dispatch_arg_indices_reverse.set(schema.arguments().size() - 1 - index); } } @@ -225,8 +265,12 @@ struct TORCH_API DispatchKeyExtractor final { explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse) : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse), +<<<<<<< HEAD nonFallthroughKeys_(DispatchKeySet::FULL), requiresBitsetPerBackend_(false) { +======= + nonFallthroughKeys_(DispatchKeySet::FULL) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL; } @@ -252,7 +296,11 @@ struct TORCH_API DispatchKeyExtractor final { // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast // path), or if we need to fall back to the slower path and check // nonFallthroughKeysPerBackend_ +<<<<<<< HEAD bool requiresBitsetPerBackend_; +======= + bool requiresBitsetPerBackend_{false}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } // namespace c10 diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 7ff4901a16b0..e9d0cb88e95e 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -1,9 +1,15 @@ #include #include +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef FBCODE_CAFFE2 #include @@ -17,6 +23,7 @@ TORCH_SDT_DEFINE_SEMAPHORE(operator_end) #endif bool show_dispatch_trace() { +<<<<<<< HEAD 
static auto envar = std::getenv("TORCH_SHOW_DISPATCH_TRACE"); if (envar) { @@ -24,6 +31,15 @@ bool show_dispatch_trace() { return false; } if (strcmp(envar, "1") == 0) { +======= + static auto envar = c10::utils::get_env("TORCH_SHOW_DISPATCH_TRACE"); + + if (envar.has_value()) { + if (envar == "0") { + return false; + } + if (envar == "1") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } TORCH_WARN( @@ -180,6 +196,21 @@ const std::vector Dispatcher::getAllOpNames() { }); } +<<<<<<< HEAD +======= +const std::vector Dispatcher::getAllOpNamesForDispatchKey(DispatchKey k) { + return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::vector { + std::vector allOpNames; + for (const auto& op : operatorLookupTable) { + if (op.second.hasKernelForDispatchKey(k)) { + allOpNames.push_back(op.first); + } + } + return allOpNames; + }); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Postcondition: caller is responsible for disposing of registration when they // are done OperatorHandle Dispatcher::findOrRegisterName_(const OperatorName& op_name) { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index dbc501afe7ce..9407250dffa4 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -165,6 +165,13 @@ class TORCH_API Dispatcher final { // Returns a list of all operator names present in the operatorLookupTable_ const std::vector getAllOpNames(); +<<<<<<< HEAD +======= + // Returns a list of all operator names present in the operatorLookupTable_ + // for a given dispatch key + const std::vector getAllOpNamesForDispatchKey(DispatchKey k); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // ------------------------------------------------------------------------ // // Invoking operators diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 751577df6f2d..b197a3da0ef1 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -2,6 +2,14 @@ #include #include #include +<<<<<<< HEAD +======= +#include + +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace c10::impl { @@ -17,6 +25,48 @@ namespace { #endif } +<<<<<<< HEAD +======= +static const std::vector& allDispatchKeysInFullSet() { + static const auto result = []() { + std::vector vec; + for (const auto dispatch_key: DispatchKeySet(DispatchKeySet::FULL)) { + vec.push_back(dispatch_key); + } + return vec; + }(); + return result; +} + +// Returns an array of the same size as the dispatch table, where each +// entry is the DispatchKey that the corresponding index in the +// dispatch table represents. +static const auto& getDispatchTableIndexToKey() { + static const auto result = []() { + using result_type = std::array; + result_type arr; + arr.fill(DispatchKey::Undefined); + for (const auto dispatch_key: allDispatchKeysInFullSet()) { + const auto index = getDispatchTableIndexForDispatchKey(dispatch_key); + TORCH_INTERNAL_ASSERT(arr.at(index) == DispatchKey::Undefined); + arr.at(index) = dispatch_key; + } + // Self-test. 
Should be plenty cheap enough to just run in prod + // builds. We just need to make sure that we have the dispatch key + // for every entry in the table, and we assert in + // update_array_entry above that we also don't have any conflicts + // during computation. + TORCH_INTERNAL_ASSERT(getDispatchTableIndexForDispatchKey(DispatchKey::Undefined) == 0); + TORCH_INTERNAL_ASSERT(arr[0] == DispatchKey::Undefined); + for (const auto index : c10::irange(1, arr.size())) { + TORCH_INTERNAL_ASSERT(arr[index] != DispatchKey::Undefined, "missing dispatch key at index ", index); + } + return arr; + }(); + return result; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OperatorEntry::OperatorEntry(OperatorName&& operator_name) : name_(std::move(operator_name)) , schema_() @@ -31,8 +81,32 @@ OperatorEntry::OperatorEntry(OperatorName&& operator_name) , is_observed_(ObservedOperators::isObserved(name_)) { // Pick up any backend fallbacks that were registered prior to this +<<<<<<< HEAD // OperatorEntry being created updateDispatchTableFull_(c10::Dispatcher::singleton()); +======= + // OperatorEntry being created. + + // We are essentially directly implementing + // updateDispatchTableFull_, taking into account that we know + // kernels_ is empty() and therefore + // computeDispatchTableEntryWithDebug cases 1 and 2.1 through 2.5 + // won't do anything. + const auto& dispatcher = c10::Dispatcher::singleton(); + const auto& dispatch_table_index_to_key = getDispatchTableIndexToKey(); + for (const auto dispatch_ix: c10::irange(dispatcher.backendFallbackKernels_.size())) { + const auto& bfk = dispatcher.backendFallbackKernels_[dispatch_ix]; + if (bfk.kernel.isValid()) { + dispatchTable_[dispatch_ix] = bfk.kernel; + if (bfk.kernel.isFallthrough()) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dispatch_ix < dispatch_table_index_to_key.size()); + dispatchKeyExtractor_.setOperatorHasFallthroughForKey(dispatch_table_index_to_key[dispatch_ix], true); + } + } else { + dispatchTable_[dispatch_ix] = missingKernel().kernel; + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } namespace { @@ -150,7 +224,12 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( #endif // Suppress the warning for Meta key as we are overriding C++ meta functions with python meta functions // for some ops +<<<<<<< HEAD if (dispatch_key != DispatchKey::Meta) { +======= + // Also suppress the warning for MTIA, as MTIA achieves CPU fallback by overriding registration. + if (dispatch_key != DispatchKey::Meta && dispatch_key != DispatchKey::MTIA) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN_ONCE("Warning only once for all operators, other operators may also be overridden.\n", " Overriding a previously registered kernel for the same operator and the same dispatch key\n", " operator: ", (schema_.has_value() ? 
toString(schema_->schema) : toString(name_)), "\n", @@ -290,7 +369,11 @@ std::pair OperatorEntry::computeDispatchTab // CompositExplicitAutogradNonFunctional > CompositeExplicitAutograd > CompositeImplicitAutograd > Autograd // Note [CompositeExplicitAutograd and CompositeImplicitAutograd] // When there're registrations to both CompositeExplicitAutograd & CompositeImplicitAutograd & Autograd, from (2.2) we know CompositeExplicitAutograd +<<<<<<< HEAD // and Autograd kernels will be picked up and CompositeImplicitAutograd is overriden. +======= + // and Autograd kernels will be picked up and CompositeImplicitAutograd is overridden. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // This is fine and in practice CompositeExplicitAutograd and CompositeImplicitAutograd shouldn't co-exist for an op. // TODO: Update alias key precedence after we add new alias keys AutogradDispatchCPUOrCUDA . @@ -452,7 +535,11 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) // or CompositeImplicitAutograd alias key so that we don't break the support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd) // should return true, it returns false because Undefined cannot be represented in a DispatchKeySet. updateDispatchTable_(dispatcher, DispatchKey::Undefined); +<<<<<<< HEAD for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { +======= + for (auto k : allDispatchKeysInFullSet()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) updateDispatchTable_(dispatcher, k); } } @@ -466,7 +553,11 @@ void OperatorEntry::checkInvariants() const { for (const auto& kv : kernels_) { TORCH_INTERNAL_ASSERT(!kv.second.empty(), dumpState()); } +<<<<<<< HEAD for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { +======= + for (auto k : allDispatchKeysInFullSet()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k); auto idx = getDispatchTableIndexForDispatchKey(k); if (C10_UNLIKELY(idx == -1)) { @@ -483,7 +574,11 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << "["; bool has_kernels = false; +<<<<<<< HEAD for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { +======= + for (auto k : allDispatchKeysInFullSet()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto iter = getDispatchTableIndexForDispatchKey(k); if (iter == -1 || !dispatchTable_[iter].isValid()) { continue; @@ -569,7 +664,11 @@ std::string OperatorEntry::dumpComputedTable() const { // Need to handle Undefined separately, because its a runtime key that can't be represented // in a DispatchKeySet. 
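  // (Undefined still owns a real slot, though: the self-test in
  // getDispatchTableIndexToKey() above pins it to dispatch table index 0; it
  // simply never comes out of the allDispatchKeysInFullSet() iteration, so it
  // is listed explicitly before the other runtime keys.)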
std::vector runtime_keys = {DispatchKey::Undefined}; +<<<<<<< HEAD for (auto k : DispatchKeySet(DispatchKeySet::FULL)) runtime_keys.push_back(k); +======= + for (auto k : allDispatchKeysInFullSet()) runtime_keys.push_back(k); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (auto k : runtime_keys) { auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k); diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp index 543c6f830f40..fdbf394e5372 100644 --- a/aten/src/ATen/core/dynamic_type.cpp +++ b/aten/src/ATen/core/dynamic_type.cpp @@ -78,15 +78,23 @@ DynamicType::~DynamicType() { arguments_.~Arguments(); } +<<<<<<< HEAD std::shared_ptr DynamicType::create(const Type& other) { +======= +SingletonOrSharedTypePtr DynamicType::create(const Type& other) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (auto dynRaw = other.castRaw()) { TORCH_INTERNAL_ASSERT( !dynRaw->weak_from_this().expired(), "Error creating dynamic type instance not managed by shared_ptr: ", other.str()); +<<<<<<< HEAD } if (auto dyn = other.cast()) { return dyn; +======= + return SingletonTypePtr(dynRaw); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return std::shared_ptr(new DynamicType{other}); } diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index 2e7b7cbc5d31..2d095dc1a81d 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -187,7 +187,13 @@ class DynamicType : public SharedType { return false; } friend struct Type; +<<<<<<< HEAD static std::shared_ptr create(const Type& ty); +======= + // NOTE: Here we are using SingletonOrSharedTypePtr to mean + // "original-type-because-it-was-actually-a-DynamicType or shared". 
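  // (Mirroring the dynamic_type.cpp change above: create() hands back a
  // non-owning SingletonTypePtr when the argument already is a
  // shared_ptr-managed DynamicType, and only allocates a fresh shared
  // DynamicType for any other Type; callers can treat the result as an
  // ordinary smart pointer in either case.)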
+ static SingletonOrSharedTypePtr create(const Type& ty); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DynamicType(const Type& other); bool equals(const DynamicType& other) const; diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 9478b11ee108..def0d11a2d89 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -41,9 +41,21 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const { } }; std::vector new_arguments, new_returns; +<<<<<<< HEAD std::transform(arguments().begin(), arguments().end(), std::back_inserter(new_arguments), cloneWithRealTypes); // NB: SymInt returns are always SymInt std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), alwaysCloneWithRealTypes); +======= + new_arguments.reserve(arguments().size()); + for (const auto& arg: arguments()) { + new_arguments.push_back(cloneWithRealTypes(arg)); + } + // NB: SymInt returns are always SymInt + new_returns.reserve(returns().size()); + for (const auto& ret: returns()) { + new_returns.push_back(alwaysCloneWithRealTypes(ret)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return FunctionSchema( name(), overload_name(), diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index f4d5ee6a3fd3..b79475f72ea5 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -71,7 +71,11 @@ inline void FunctionSchema::checkAndNormalizeInputs( for(const auto& k : kwargs) { names.emplace_back(k.first); } +<<<<<<< HEAD throw std::runtime_error(findErrorInKwargs(names)); +======= + TORCH_CHECK(false, findErrorInKwargs(names)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index b8a5b418bbc0..5a9b885900fd 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -1,6 +1,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace torch { @@ -11,7 +15,11 @@ namespace { #ifdef STRIP_ERROR_MESSAGES return std::string(); #else +<<<<<<< HEAD return c10::str("registered at ", file, ":", line); +======= + return fmt::format("registered at {}:{}", file, line); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } @@ -58,6 +66,29 @@ void Library::reset() { #define ERROR_CONTEXT "(Error occurred while processing ", toString(kind_), " block at ", file_, ":", line_, ")" +<<<<<<< HEAD +======= +#if defined(TORCH_LIBRARY_THREAD_UNSAFE_LAZY_INIT) && defined(C10_MOBILE) +namespace detail { + // Insertion of library initializers into torch_library_initializers is not + // thread-safe as we expect this to be handled by the applications dynamic + // library loader, which would guarantee that only one thread is inserting + // libraries into the vector. We do require thread safety when calling + // initialize_torch_libraries however, as this can be called from any + // thread, and potentially race and corrupt the library initializer vector. 
+ std::mutex torch_library_initializer_mutex; + std::vector torch_library_initializers; +} // namespace detail +void initialize_torch_libraries() { + const std::lock_guard lock(detail::torch_library_initializer_mutex); + for (auto* initializer : detail::torch_library_initializers) { + initializer->initialize(); + } + detail::torch_library_initializers.clear(); +} +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Library::Library(Kind kind, std::string ns, std::optional k, const char* file, uint32_t line) : kind_(kind) , ns_(ns == "_" ? std::nullopt : std::make_optional(std::move(ns))) diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 0ffc061870f1..415aaf18bcdd 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -1787,8 +1787,12 @@ TEST(NewOperatorRegistrationTest, dispatchAutogradPrecedence) { } TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther) { +<<<<<<< HEAD // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool fpga_called, math_called = false; +======= + bool fpga_called = false, math_called = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto m = MAKE_TORCH_LIBRARY(test); m.def("fn", torch::dispatch(c10::DispatchKey::FPGA, [&](const Tensor& x) { fpga_called = true; return x; })); m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index b94e3cd6bd87..0092b0d8056d 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -7,6 +7,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -45,9 +49,15 @@ static_assert( "getTypePtr> not returning const ref!"); TypeVerbosity type_verbosity() { +<<<<<<< HEAD static const char* c_verbosity = std::getenv("PYTORCH_JIT_TYPE_VERBOSITY"); static TypeVerbosity verbosity = c_verbosity ? static_cast(std::stoi(c_verbosity)) : TypeVerbosity::Default; +======= + static const auto c_verbosity = c10::utils::get_env("PYTORCH_JIT_TYPE_VERBOSITY"); + static TypeVerbosity verbosity = c_verbosity ? 
+ static_cast(std::stoi(c_verbosity.value())) : TypeVerbosity::Default; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return verbosity; } diff --git a/aten/src/ATen/core/type_factory.h b/aten/src/ATen/core/type_factory.h index 5b573b5c41e9..157451f7218e 100644 --- a/aten/src/ATen/core/type_factory.h +++ b/aten/src/ATen/core/type_factory.h @@ -44,7 +44,11 @@ struct TORCH_API TypeFactoryBase { c10::DynamicType::Arguments{}); } template +<<<<<<< HEAD C10_ERASE static c10::DynamicTypePtr get() { +======= + C10_ERASE static decltype(auto) get() { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return DynamicTypeTrait::getBaseType(); } static const std::unordered_map& basePythonTypes(); diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index 4d1d05ea8d32..a099e87de759 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -6,8 +6,29 @@ #include #include +<<<<<<< HEAD namespace at::vec { +======= +namespace at { +namespace detail { +// We prefer to convert through float for reduced-precision floating +// point types if we have a Vectorized specialization for float and we +// don't have one for the actual type in question. +template +struct should_prefer_converting_through_float + : std::bool_constant< + is_reduced_floating_point_v && + vec::is_vec_specialized_for_v && + !vec::is_vec_specialized_for_v> {}; + +template +constexpr auto should_prefer_converting_through_float_v = + should_prefer_converting_through_float::value; +} // namespace detail + +namespace vec { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // slow path template inline scalar_t vec_reduce_all( @@ -29,16 +50,33 @@ inline scalar_t vec_reduce_all( template struct VecReduceAllSIMD { +<<<<<<< HEAD static inline scalar_t apply(const Op& vec_fun, const Vectorized& acc_vec) { +======= + static inline scalar_t apply( + const Op& vec_fun, + const Vectorized& acc_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(vec_fun, acc_vec, Vectorized::size()); } }; +<<<<<<< HEAD #if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) #if defined(CPU_CAPABILITY_AVX2) template struct VecReduceAllSIMD { static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { +======= +#if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && \ + !defined(C10_MOBILE) +#if defined(CPU_CAPABILITY_AVX2) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; Vec v = acc_vec; // 128-bit shuffle @@ -57,7 +95,13 @@ struct VecReduceAllSIMD { #if defined(CPU_CAPABILITY_AVX512) template struct VecReduceAllSIMD { +<<<<<<< HEAD static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { +======= + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; Vec v = acc_vec; // 256-bit shuffle @@ -76,6 
+120,7 @@ struct VecReduceAllSIMD { } }; #endif // defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) #if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE) @@ -86,26 +131,75 @@ struct VecReduceAllSIMD { Vec v = acc_vec; // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7, a4+a8, a1+a5, a2+a6, -, -, -, -] +======= +#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && + // !defined(C10_MOBILE) + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + + // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7, + // a4+a8, a1+a5, a2+a6, -, -, -, -] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) float32x4_t v1_1 = vextq_f32(v, v, 2); Vec v1 = v1_1; // [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] v = vec_fun(v, v1); +<<<<<<< HEAD // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -, -] v1_1 = vrev64q_f32(v); v1 = v1_1; // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -] +======= + // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, + // -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -, + // -] + v1_1 = vrev64q_f32(v); + v1 = v1_1; + // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, + // a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) v = vec_fun(v, v1); return v[0]; } }; +<<<<<<< HEAD #endif // defined(__aarch64__) #if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && defined(CPU_CAPABILITY_SVE256) template struct VecReduceAllSIMD { static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { +======= + +template <> +struct VecReduceAllSIMD>> { + static inline float apply( + const std::plus>& vec_fun, + const Vectorized& acc_vec) { + return vaddvq_f32(acc_vec); + } +}; +#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) + // && !defined(CPU_CAPABILITY_SVE) + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + defined(CPU_CAPABILITY_SVE256) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = Vectorized; Vec v = acc_vec; // 128-bit shuffle @@ -123,6 +217,7 @@ struct VecReduceAllSIMD { return svlasta(svpfalse(), v); } }; +<<<<<<< HEAD #endif // defined(__aarch64__) @@ -134,6 +229,26 @@ inline scalar_t vec_reduce_all(const Op& vec_fun, const Vectorized& ac template , int> = 0> inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { +======= +#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) + // && defined(CPU_CAPABILITY_SVE256) + +template +inline scalar_t vec_reduce_all( + const Op& vec_fun, + const Vectorized& 
acc_vec) { + return VecReduceAllSIMD::apply(vec_fun, acc_vec); +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +inline scalar_t reduce_all( + const Op& vec_fun, + const scalar_t* data, + int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = vec::Vectorized; if (size < Vec::size()) return vec_reduce_all(vec_fun, Vec::loadu(data, size), size); @@ -151,16 +266,34 @@ inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size } // similar to reduce_all, but reduces into two outputs +<<<<<<< HEAD template , int> = 0> inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2, const scalar_t* data, int64_t size) { +======= +template < + typename scalar_t, + typename Op1, + typename Op2, + typename std::enable_if_t, int> = 0> +inline std::pair reduce2_all( + const Op1& vec_fun1, + const Op2& vec_fun2, + const scalar_t* data, + int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vec = vec::Vectorized; if (size < Vec::size()) { auto loaded_data = Vec::loadu(data, size); return std::pair( +<<<<<<< HEAD vec_reduce_all(vec_fun1, loaded_data, size), vec_reduce_all(vec_fun2, loaded_data, size)); +======= + vec_reduce_all(vec_fun1, loaded_data, size), + vec_reduce_all(vec_fun2, loaded_data, size)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } int64_t d = Vec::size(); Vec acc_vec1 = Vec::loadu(data); @@ -176,12 +309,23 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d); } return std::pair( +<<<<<<< HEAD vec_reduce_all(vec_fun1, acc_vec1), vec_reduce_all(vec_fun2, acc_vec2)); } template , int> = 0> +======= + vec_reduce_all(vec_fun1, acc_vec1), vec_reduce_all(vec_fun2, acc_vec2)); +} + +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline scalar_t map_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -205,8 +349,16 @@ inline scalar_t map_reduce_all( return vec_reduce_all(red_fun, acc_vec); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline scalar_t map2_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -237,8 +389,16 @@ inline scalar_t map2_reduce_all( return vec_reduce_all(red_fun, acc_vec); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline scalar_t map3_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -274,8 +434,18 @@ inline scalar_t map3_reduce_all( return vec_reduce_all(red_fun, acc_vec); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + 
std::is_invocable_v>, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map( const Op& vec_fun, scalar_t* output_data, @@ -293,8 +463,21 @@ inline void map( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map2( const Op& vec_fun, scalar_t* output_data, @@ -317,8 +500,22 @@ inline void map2( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map3( const Op& vec_fun, scalar_t* output_data, @@ -344,8 +541,23 @@ inline void map3( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map4( const Op& vec_fun, scalar_t* output_data, @@ -374,4 +586,9 @@ inline void map4( } } +<<<<<<< HEAD } // namespace at::vec +======= +} // namespace vec +} // namespace at +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/functional_bfloat16.h b/aten/src/ATen/cpu/vec/functional_bfloat16.h index 3bd22b3820f0..7c13df08320c 100644 --- a/aten/src/ATen/cpu/vec/functional_bfloat16.h +++ b/aten/src/ATen/cpu/vec/functional_bfloat16.h @@ -6,27 +6,50 @@ #include namespace at::vec { +<<<<<<< HEAD // BFloat16 specification template struct VecScalarType { using type = scalar_t; }; template <> struct VecScalarType { using type = float; }; template <> struct VecScalarType { using type = float; }; +======= +// BFloat16 specification +template +struct VecScalarType { + using type = scalar_t; +}; +template <> +struct VecScalarType { + using type = float; +}; +template <> +struct VecScalarType { + using type = float; +}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // This is different from at::acc_type since we only need to specialize BFloat16 template using vec_scalar_t = typename VecScalarType::type; // Vector conversion between float and bfloat16/half +<<<<<<< HEAD template , int> = 0> inline std::tuple, Vectorized> convert_to_float(const Vectorized&); template <> inline std::tuple, Vectorized> convert_to_float (const Vectorized& a) { +======= +template <> +inline std::tuple, Vectorized> convert_to_float< + BFloat16>(const Vectorized& a) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return convert_bfloat16_float(a); } template <> +<<<<<<< HEAD inline std::tuple, Vectorized> 
convert_to_float (const Vectorized& a) { return convert_half_float(a); } @@ -37,10 +60,22 @@ inline Vectorized convert_from_float(const Vectorized&, const V template <> inline Vectorized convert_from_float(const Vectorized& a, const Vectorized& b) { +======= +inline std::tuple, Vectorized> convert_to_float( + const Vectorized& a) { + return convert_half_float(a); +} + +template <> +inline Vectorized convert_from_float( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return convert_float_bfloat16(a, b); } template <> +<<<<<<< HEAD inline Vectorized convert_from_float(const Vectorized& a, const Vectorized& b) { return convert_float_half(a, b); } @@ -51,10 +86,32 @@ inline void load_to_float(const scalar_t *data, Vectorized &out1, Vectori template <> inline void load_to_float (const BFloat16 *data, Vectorized &out1, Vectorized &out2) { +======= +inline Vectorized convert_from_float( + const Vectorized& a, + const Vectorized& b) { + return convert_float_half(a, b); +} + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline void load_to_float( + const scalar_t* data, + Vectorized& out1, + Vectorized& out2); + +template <> +inline void load_to_float( + const BFloat16* data, + Vectorized& out1, + Vectorized& out2) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) load_fp32_from_bf16(data, out1, out2); } template <> +<<<<<<< HEAD inline void load_to_float (const Half *data, Vectorized &out1, Vectorized &out2) { load_fp32_from_fp16(data, out1, out2); } @@ -65,21 +122,49 @@ inline void load_to_float(const scalar_t *data, Vectorized &out); template <> inline void load_to_float (const BFloat16 *data, Vectorized &out) { +======= +inline void load_to_float( + const Half* data, + Vectorized& out1, + Vectorized& out2) { + load_fp32_from_fp16(data, out1, out2); +} + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline void load_to_float(const scalar_t* data, Vectorized& out); + +template <> +inline void load_to_float( + const BFloat16* data, + Vectorized& out) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) load_fp32_from_bf16(data, out); } template <> +<<<<<<< HEAD inline void load_to_float (const Half *data, Vectorized &out) { load_fp32_from_fp16(data, out); } // Note that we already have specialized member of Vectorized for BFloat16 // so the following functions would run smoothly: +======= +inline void load_to_float(const Half* data, Vectorized& out) { + load_fp32_from_fp16(data, out); +} + +// Note that we already have specialized member of Vectorized for +// BFloat16 so the following functions would run smoothly: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // using Vec = Vectorized; // Vec one = Vec(BFloat16(1)); // vec::map([](Vec x) { return one / (one + x.exp()); }, y_ptr, x_ptr, N); // // Then why we still need to specialize "functional"? +<<<<<<< HEAD // If we do specialization at Vectorized<> level, the above example would need 3 pairs of // conversion of bf16->fp32/fp32->bf16, each for ".exp()", "+" and "/". 
// If we do specialization at vec::map<>() level, we have only 1 pair of conversion @@ -88,6 +173,17 @@ inline void load_to_float (const Half *data, Vectorized &out) { // The following BFloat16 functionality will only do data type conversion for input // and output vector (reduce functionality will only convert the final scalar back to bf16). // Compared to Vectorized<> specialization, +======= +// If we do specialization at Vectorized<> level, the above example would need +// 3 pairs of conversion of bf16->fp32/fp32->bf16, each for ".exp()", "+" and +// "/". If we do specialization at vec::map<>() level, we have only 1 pair of +// conversion of bf16->fp32/fp32->bf16, for the input and output BFloat16 +// vector only. +// +// The following BFloat16 functionality will only do data type conversion for +// input and output vector (reduce functionality will only convert the final +// scalar back to bf16). Compared to Vectorized<> specialization, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 1. better performance since we have less data type conversion; // 2. less rounding error since immediate results are kept in fp32; // 3. accumulation done on data type of fp32. @@ -95,8 +191,15 @@ inline void load_to_float (const Half *data, Vectorized &out) { // If you plan to extend this file, please ensure adding unit tests at // aten/src/ATen/test/vec_test_all_types.cpp // +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { using bVec = vec::Vectorized; using fVec = vec::Vectorized; @@ -104,7 +207,12 @@ inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { bVec data_bvec = bVec::loadu(data, size); auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { +<<<<<<< HEAD data_fvec0 = fVec::set(data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size()); +======= + data_fvec0 = fVec::set( + data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(vec_fun, data_fvec0, fVec::size()); } else { return vec_reduce_all(vec_fun, data_fvec0, size); @@ -124,27 +232,55 @@ inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { acc_fvec0 = vec_fun(acc_fvec0, data_fvec0); +<<<<<<< HEAD acc_fvec1 = fVec::set(acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); } else { acc_fvec0 = fVec::set(acc_fvec0, vec_fun(acc_fvec0, data_fvec0), size - d); +======= + acc_fvec1 = fVec::set( + acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + acc_fvec0 = + fVec::set(acc_fvec0, vec_fun(acc_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc_fvec0 = vec_fun(acc_fvec0, acc_fvec1); return vec_reduce_all(vec_fun, acc_fvec0); } +<<<<<<< HEAD template , int> = 0> inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2, const scalar_t* data, int64_t size) { +======= 
+template < + typename scalar_t, + typename Op1, + typename Op2, + typename std::enable_if_t, int> = 0> +inline std::pair reduce2_all( + const Op1& vec_fun1, + const Op2& vec_fun2, + const scalar_t* data, + int64_t size) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using bVec = vec::Vectorized; using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { +<<<<<<< HEAD fVec acc1_fvec = fVec::set(data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size()); fVec acc2_fvec = fVec::set(data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size()); +======= + fVec acc1_fvec = fVec::set( + data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size()); + fVec acc2_fvec = fVec::set( + data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::pair( vec_reduce_all(vec_fun1, acc1_fvec, fVec::size()), vec_reduce_all(vec_fun2, acc2_fvec, fVec::size())); @@ -171,12 +307,29 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0); +<<<<<<< HEAD acc1_fvec1 = fVec::set(acc1_fvec1, vec_fun1(acc1_fvec1, data_fvec1), size - d - fVec::size()); acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0); acc2_fvec1 = fVec::set(acc2_fvec1, vec_fun2(acc2_fvec1, data_fvec1), size - d - fVec::size()); } else { acc1_fvec0 = fVec::set(acc1_fvec0, vec_fun1(acc1_fvec0, data_fvec0), size - d); acc2_fvec0 = fVec::set(acc2_fvec0, vec_fun2(acc2_fvec0, data_fvec0), size - d); +======= + acc1_fvec1 = fVec::set( + acc1_fvec1, + vec_fun1(acc1_fvec1, data_fvec1), + size - d - fVec::size()); + acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0); + acc2_fvec1 = fVec::set( + acc2_fvec1, + vec_fun2(acc2_fvec1, data_fvec1), + size - d - fVec::size()); + } else { + acc1_fvec0 = + fVec::set(acc1_fvec0, vec_fun1(acc1_fvec0, data_fvec0), size - d); + acc2_fvec0 = + fVec::set(acc2_fvec0, vec_fun2(acc2_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc1_fvec0 = vec_fun1(acc1_fvec0, acc1_fvec1); @@ -186,8 +339,16 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f vec_reduce_all(vec_fun2, acc2_fvec0)); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline float map_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -201,7 +362,12 @@ inline float map_reduce_all( if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); +<<<<<<< HEAD data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +======= + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(red_fun, data_fvec0, fVec::size()); } else { data_fvec0 = 
map_fun(data_fvec0); @@ -228,18 +394,35 @@ inline float map_reduce_all( data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); +<<<<<<< HEAD acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); } else { data_fvec0 = map_fun(data_fvec0); acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +======= + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); return vec_reduce_all(red_fun, acc_fvec0); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline float map2_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -256,7 +439,12 @@ inline float map2_reduce_all( if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); +<<<<<<< HEAD data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +======= + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(red_fun, data_fvec0, fVec::size()); } else { data_fvec0 = map_fun(data_fvec0, data2_fvec0); @@ -289,18 +477,35 @@ inline float map2_reduce_all( data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); +<<<<<<< HEAD acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); } else { data_fvec0 = map_fun(data_fvec0, data2_fvec0); acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +======= + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0, data2_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); return vec_reduce_all(red_fun, acc_fvec0); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline float map3_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, @@ -320,7 +525,12 @@ inline float map3_reduce_all( if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); +<<<<<<< HEAD data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +======= + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_reduce_all(red_fun, data_fvec0, fVec::size()); } else { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); @@ -359,18 +569,37 @@ inline float map3_reduce_all( data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); +<<<<<<< HEAD acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); } else { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +======= + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); return vec_reduce_all(red_fun, acc_fvec0); } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v>), + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map( const Op& vec_fun, scalar_t* output_data, @@ -397,8 +626,15 @@ inline void map( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map( const Op& vec_fun, scalar_t* output_data, @@ -419,7 +655,12 @@ inline void map( fVec data_fvec0, data_fvec1; if (size - d > fVec::size()) { data_fvec0 = fVec::loadu(input_data + d); +<<<<<<< HEAD data_fvec1 = fVec::loadu(input_data + d + fVec::size(), size - d - fVec::size()); +======= + data_fvec1 = + fVec::loadu(input_data + d + fVec::size(), size - d - fVec::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { // choose to align with behaviour of bVec::loadu(ptr, size), // which leaves data_fvec1 uninitialized @@ -432,8 +673,21 @@ inline void map( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map2( const Op& vec_fun, scalar_t* output_data, @@ -465,8 +719,22 @@ inline void map2( } } +<<<<<<< HEAD template , int> = 0> +======= +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map3( const Op& vec_fun, scalar_t* output_data, @@ -503,8 +771,23 @@ inline void map3( } } +<<<<<<< HEAD template , int> = 0> +======= 
+template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void map4( const Op& vec_fun, scalar_t* output_data, @@ -525,8 +808,15 @@ inline void map4( auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); bVec data4_bvec = bVec::loadu(input_data4 + d); auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); +<<<<<<< HEAD fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); +======= + fVec output_fvec0 = + vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); + fVec output_fvec1 = + vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); output_bvec.store(output_data + d); } @@ -539,8 +829,15 @@ inline void map4( auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); bVec data4_bvec = bVec::loadu(input_data4 + d, size - d); auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); +<<<<<<< HEAD fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); +======= + fVec output_fvec0 = + vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); + fVec output_fvec1 = + vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); output_bvec.store(output_data + d, size - d); } diff --git a/aten/src/ATen/cpu/vec/intrinsics.h b/aten/src/ATen/cpu/vec/intrinsics.h index 48b18793b079..fd7d793e846b 100644 --- a/aten/src/ATen/cpu/vec/intrinsics.h +++ b/aten/src/ATen/cpu/vec/intrinsics.h @@ -13,10 +13,21 @@ /* Microsoft C/C++-compatible compiler */ #include #if _MSC_VER <= 1900 +<<<<<<< HEAD #define _mm256_extract_epi64(X, Y) (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) #define _mm256_extract_epi32(X, Y) (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) #define _mm256_extract_epi16(X, Y) (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) #define _mm256_extract_epi8(X, Y) (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) +======= +#define _mm256_extract_epi64(X, Y) \ + (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2)) +#define _mm256_extract_epi32(X, Y) \ + (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4)) +#define _mm256_extract_epi16(X, Y) \ + (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8)) +#define _mm256_extract_epi8(X, Y) \ + (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__)) /* GCC-compatible compiler, targeting ARM with NEON */ @@ -25,9 +36,15 @@ /* GCC-compatible compiler, targeting ARM with SVE */ #include #endif 
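// [Editorial aside, not part of the patch] The _MSC_VER <= 1900 fallback
// macros reformatted earlier in this hunk emulate the missing 256-bit
// extract intrinsics by first pulling out the relevant 128-bit half and
// then extracting within it. For example, with a compile-time lane index
// of 2:
//
//   __m256i v = /* some 256-bit vector */;
//   __m128i hi   = _mm256_extractf128_si256(v, 2 >> 1); // upper 128-bit half
//   int64_t lane = _mm_extract_epi64(hi, 2 % 2);        // element 0 of that half
//
// which is exactly what _mm256_extract_epi64(v, 2) expands to under the
// macro definition above.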
+<<<<<<< HEAD #if defined (MISSING_ARM_VLD1) #include #elif defined (MISSING_ARM_VST1) +======= +#if defined(MISSING_ARM_VLD1) +#include +#elif defined(MISSING_ARM_VST1) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #endif #elif defined(__GNUC__) && defined(__IWMMXT__) @@ -36,8 +53,13 @@ #elif defined(__s390x__) // targets Z/architecture // we will include vecintrin later +<<<<<<< HEAD #elif (defined(__GNUC__) || defined(__xlC__)) && \ (defined(__VEC__) || defined(__ALTIVEC__)) +======= +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ #include /* We need to undef those tokens defined by to avoid conflicts diff --git a/aten/src/ATen/cpu/vec/sve/sve_helper.h b/aten/src/ATen/cpu/vec/sve/sve_helper.h index e511ebb52b2e..3005f77aed0a 100644 --- a/aten/src/ATen/cpu/vec/sve/sve_helper.h +++ b/aten/src/ATen/cpu/vec/sve/sve_helper.h @@ -7,6 +7,7 @@ #if defined(CPU_CAPABILITY_SVE) // Define the data type of VLS(vector-length specific). +<<<<<<< HEAD typedef svbool_t vls_pred_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); typedef svint8_t vls_int8_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); typedef svint16_t vls_int16_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); @@ -19,6 +20,34 @@ typedef svuint64_t vls_uint64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH typedef svfloat16_t vls_float16_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); typedef svfloat32_t vls_float32_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +======= +typedef svbool_t vls_pred_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint8_t vls_int8_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint16_t vls_int16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint32_t vls_int32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svint64_t vls_int64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint8_t vls_uint8_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint16_t vls_uint16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint32_t vls_uint32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svuint64_t vls_uint64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat16_t vls_float16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svbfloat16_t vls_bfloat16_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat32_t vls_float32_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +typedef svfloat64_t vls_float64_t + __attribute__((arm_sve_vector_bits(VECTOR_WIDTH * 8))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ptrue svptrue_b8() #define ZERO_S8 svdup_n_s8(0) @@ -32,7 +61,11 @@ typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDT #define ZERO_F16 svdup_n_f16(0.f) #define ZERO_F32 svdup_n_f32(0.f) #define ZERO_F64 svdup_n_f64(0.0) +<<<<<<< HEAD #define ONE_S8 svdup_n_s8(1) +======= +#define ONE_S8 
svdup_n_s8(1) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ONE_S16 svdup_n_s16(1) #define ONE_S32 svdup_n_s32(1) #define ONE_S64 svdup_n_s64(1) @@ -41,6 +74,10 @@ typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDT #define ONE_U32 svdup_n_u32(1) #define ONE_U64 svdup_n_u64(1) #define ONE_F16 svdup_n_f16(1.f) +<<<<<<< HEAD +======= +#define ONE_BF16 svdup_n_bf16(1.f) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ONE_F32 svdup_n_f32(1.f) #define ONE_F64 svdup_n_f64(1.0) #define ALL_S8_TRUE_MASK svdup_n_s8(0xff) @@ -55,6 +92,11 @@ typedef svfloat64_t vls_float64_t __attribute__((arm_sve_vector_bits(VECTOR_WIDT #define ALL_U8_FALSE_MASK svdup_n_u8(0x00) #define ALL_F16_TRUE_MASK svreinterpret_f16_s16(ALL_S16_TRUE_MASK) #define ALL_F16_FALSE_MASK svreinterpret_f16_s16(ALL_S16_FALSE_MASK) +<<<<<<< HEAD +======= +#define ALL_BF16_TRUE_MASK svreinterpret_bf16_s16(ALL_S16_TRUE_MASK) +#define ALL_BF16_FALSE_MASK svreinterpret_bf16_s16(ALL_S16_FALSE_MASK) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define ALL_F32_TRUE_MASK svreinterpret_f32_s32(ALL_S32_TRUE_MASK) #define ALL_F32_FALSE_MASK svreinterpret_f32_s32(ALL_S32_FALSE_MASK) #define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK) diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h new file mode 100644 index 000000000000..7f05c2ad166f --- /dev/null +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -0,0 +1,580 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +namespace at { +namespace vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_bfloat16_t values; + + public: + using value_type = BFloat16; + using size_type = int; + + static constexpr size_type size() { + return VECTOR_WIDTH / sizeof(BFloat16); + } + + Vectorized() {} + Vectorized(svbfloat16_t v) : values(v) {} + Vectorized(int val); + Vectorized(BFloat16 val); + + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... 
vals) { + __at_align__ BFloat16 buffer[size()] = {vals...}; + values = svld1_bf16(ptrue, reinterpret_cast(buffer)); + } + + operator svbfloat16_t() const { + return values; + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s16(ptrue, svreinterpret_s16_bf16(mask_), ALL_S16_TRUE_MASK); + return svsel_bf16(mask, b, a); + } + template + static Vectorized arange( + BFloat16 base = 0.f, + step_t step = static_cast(1)) { + __at_align__ BFloat16 buffer[size()]; + for (int64_t i = 0; i < size(); i++) { + buffer[i] = base + i * step; + } + return svld1_bf16(ptrue, reinterpret_cast(buffer)); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + if (count == 0) { + return a; + } else if (count < size()) { + return svsel_bf16(svwhilelt_b16(0ull, count), b, a); + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) + return svld1_bf16(ptrue, reinterpret_cast(ptr)); + svbool_t pg = svwhilelt_b16(0ull, count); + return svld1_bf16(pg, reinterpret_cast(ptr)); + } + void store(void* ptr, int64_t count = size()) const { + __at_align__ bfloat16_t tmp[size()]; + std::memset(tmp, 0, sizeof(tmp)); + if (count == size()) { + svst1_bf16(ptrue, reinterpret_cast(tmp), values); + } else { + svbool_t pg = svwhilelt_b16(0ull, count); + svst1_bf16(pg, reinterpret_cast(tmp), values); + } + std::memcpy( + reinterpret_cast(ptr), + reinterpret_cast(tmp), + count * sizeof(bfloat16_t)); + } + const BFloat16& operator[](int idx) const = delete; + BFloat16& operator[](int idx) = delete; + int64_t zero_mask() const { + int64_t mask = 0; + // returns an integer mask where all zero elements are translated to + // 1-bit and others are translated to 0-bit int64_t mask = 0; + __at_align__ int16_t mask_array[size()]; + + svbool_t svbool_mask = + svcmpeq_f16(ptrue, svreinterpret_f16_bf16(values), ZERO_F16); + svst1_s16( + ptrue, + mask_array, + svsel_s16(svbool_mask, ALL_S16_TRUE_MASK, ALL_S16_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); + } + return mask; + } + Vectorized isnan() const; + bool has_inf_nan() const; + Vectorized map(BFloat16 (*f)(BFloat16)) const { + __at_align__ BFloat16 tmp[size()]; + store(tmp); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + auto mask = svdup_n_u16(0x7FFF); + auto vals = svreinterpret_u16_bf16(values); + vals = svand_u16_x(ptrue, vals, mask); + return svreinterpret_bf16_u16(vals); + } + Vectorized angle() const; + Vectorized real() const { + return values; + } + Vectorized imag() const { + return Vectorized(0.f); + } + Vectorized conj() const { + return values; + } + Vectorized acos() const; + Vectorized acosh() const; + Vectorized asin() const; + Vectorized atan() const; + Vectorized atanh() const; + Vectorized atan2(const Vectorized& b) const; + Vectorized copysign(const Vectorized& sign) const; + Vectorized erf() const; + Vectorized erfc() const; + Vectorized erfinv() const; + Vectorized exp() const; + Vectorized exp2() const; + Vectorized expm1() const; + Vectorized exp_u20() const { + return exp(); + } + Vectorized fmod(const Vectorized& q) const; + Vectorized hypot(const Vectorized& b) const; + Vectorized i0() const; + Vectorized i0e() const; + Vectorized digamma() const; + Vectorized igamma(const Vectorized& x) const; + Vectorized igammac(const Vectorized& x) const; + Vectorized 
nextafter(const Vectorized& b) const; + Vectorized log() const; + Vectorized log2() const; + Vectorized log10() const; + Vectorized log1p() const; + Vectorized frac() const; + Vectorized sin() const; + Vectorized sinh() const; + Vectorized cos() const; + Vectorized cosh() const; + Vectorized ceil() const; + Vectorized floor() const; + Vectorized neg() const { + auto mask = svdup_n_u16(0x8000); + auto vals = svreinterpret_u16_bf16(values); + vals = sveor_u16_x(ptrue, vals, mask); + return svreinterpret_bf16_u16(vals); + }; + Vectorized round() const; + Vectorized tan() const; + Vectorized tanh() const; + Vectorized trunc() const; + Vectorized lgamma() const; + Vectorized sqrt() const; + Vectorized reciprocal() const; + Vectorized rsqrt() const; + Vectorized pow(const Vectorized& b) const; + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vectorized operator==(const Vectorized& other) const; + + Vectorized operator!=(const Vectorized& other) const; + + Vectorized operator<(const Vectorized& other) const; + + Vectorized operator<=(const Vectorized& other) const; + + Vectorized operator>(const Vectorized& other) const; + + Vectorized operator>=(const Vectorized& other) const; + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +inline std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f)); + auto bf16_vec1 = svzip1_bf16(zero, a); + auto bf16_vec2 = svzip2_bf16(zero, a); + auto x1 = svreinterpret_f32_bf16(bf16_vec1); + auto x2 = svreinterpret_f32_bf16(bf16_vec2); + return {Vectorized(x1), Vectorized(x2)}; +} + +inline Vectorized convert_float_bfloat16( + const Vectorized& a, + const Vectorized& b) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + svbfloat16_t x1 = svcvt_bf16_f32_z(ptrue, a); + svbfloat16_t x2 = svcvt_bf16_f32_z(ptrue, b); + return Vectorized(svuzp1_bf16(x1, x2)); +} + +inline void load_fp32_from_bf16(const BFloat16* data, Vectorized& out) { + __at_align__ float values[Vectorized::size()]; + for (const auto k : c10::irange(Vectorized::size())) { + values[k] = data[k]; + } + out = Vectorized::loadu(values); +} + +inline void load_fp32_from_bf16( + const BFloat16* data, + Vectorized& out1, + Vectorized& out2) { + Vectorized bf16_vec = Vectorized::loadu(data); + auto floats = convert_bfloat16_float(bf16_vec); + out1 = std::get<0>(floats); + out2 = std::get<1>(floats); +} + +template +Vectorized binary_operator_via_float( + Op op, + const Vectorized& a, + const Vectorized& b) { + const auto [a_float_low, a_float_high] = convert_bfloat16_float(a); + const auto [b_float_low, b_float_high] = convert_bfloat16_float(b); + return convert_float_bfloat16( + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::plus>(), a, b); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::minus>(), a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) 
{ + return binary_operator_via_float(std::multiplies>(), a, b); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float(std::divides>(), a, b); +} + +inline Vectorized::Vectorized(int val) { + auto vals_f = svdup_n_f32(val); + values = convert_float_bfloat16(vals_f, vals_f); +} + +inline Vectorized::Vectorized(BFloat16 val) { + auto vals_f = svdup_n_f32((float)val); + values = convert_float_bfloat16(vals_f, vals_f); +} + +bool inline Vectorized::has_inf_nan() const { + auto [v1, v2] = convert_bfloat16_float(values); + return v1.has_inf_nan() || v2.has_inf_nan(); +} +// frac. Implement this here so we can use subtraction +Vectorized inline Vectorized::frac() const { + return *this - this->trunc(); +} + +#define DEFINE_BF16_FUNC_VIA_FLOAT(func_name) \ + Vectorized inline Vectorized::func_name() const { \ + auto [v1, v2] = convert_bfloat16_float(*this); \ + v1 = v1.func_name(); \ + v2 = v2.func_name(); \ + return convert_float_bfloat16(v1, v2); \ + } + +#define DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(func_name) \ + Vectorized inline Vectorized::func_name( \ + const Vectorized& a) const { \ + auto [v1, v2] = convert_bfloat16_float(*this); \ + auto [v3, v4] = convert_bfloat16_float(a); \ + v1 = v1.func_name(v3); \ + v2 = v2.func_name(v4); \ + return convert_float_bfloat16(v1, v2); \ + } + +DEFINE_BF16_FUNC_VIA_FLOAT(isnan); +DEFINE_BF16_FUNC_VIA_FLOAT(angle); +DEFINE_BF16_FUNC_VIA_FLOAT(acos); +DEFINE_BF16_FUNC_VIA_FLOAT(acosh); +DEFINE_BF16_FUNC_VIA_FLOAT(asin); +DEFINE_BF16_FUNC_VIA_FLOAT(atan); +DEFINE_BF16_FUNC_VIA_FLOAT(atanh); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(atan2); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(copysign); +DEFINE_BF16_FUNC_VIA_FLOAT(erf); +DEFINE_BF16_FUNC_VIA_FLOAT(erfc); +DEFINE_BF16_FUNC_VIA_FLOAT(exp); +DEFINE_BF16_FUNC_VIA_FLOAT(exp2); +DEFINE_BF16_FUNC_VIA_FLOAT(expm1); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(fmod); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(hypot); +DEFINE_BF16_FUNC_VIA_FLOAT(i0); +DEFINE_BF16_FUNC_VIA_FLOAT(i0e); +DEFINE_BF16_FUNC_VIA_FLOAT(digamma); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igamma); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(igammac); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(nextafter); +DEFINE_BF16_FUNC_VIA_FLOAT(log); +DEFINE_BF16_FUNC_VIA_FLOAT(log2); +DEFINE_BF16_FUNC_VIA_FLOAT(log10); +DEFINE_BF16_FUNC_VIA_FLOAT(log1p); +DEFINE_BF16_FUNC_VIA_FLOAT(sin); +DEFINE_BF16_FUNC_VIA_FLOAT(sinh); +DEFINE_BF16_FUNC_VIA_FLOAT(cos); +DEFINE_BF16_FUNC_VIA_FLOAT(cosh); +DEFINE_BF16_FUNC_VIA_FLOAT(ceil); +DEFINE_BF16_FUNC_VIA_FLOAT(floor); +DEFINE_BF16_FUNC_VIA_FLOAT(round); +DEFINE_BF16_FUNC_VIA_FLOAT(tan); +DEFINE_BF16_FUNC_VIA_FLOAT(tanh); +DEFINE_BF16_FUNC_VIA_FLOAT(trunc); +DEFINE_BF16_FUNC_VIA_FLOAT(lgamma); +DEFINE_BF16_FUNC_VIA_FLOAT(sqrt); +DEFINE_BF16_FUNC_VIA_FLOAT(reciprocal); +DEFINE_BF16_FUNC_VIA_FLOAT(rsqrt); +DEFINE_BF16_FUNC_VIA_FLOAT_W_ARG(pow); + +Vectorized inline Vectorized::operator==( + const Vectorized& other) const { + auto [f1, f2] = convert_bfloat16_float(values); + auto [f3, f4] = convert_bfloat16_float(other); + svbool_t mask1 = svcmpeq_f32(ptrue, f1, f3); + svbool_t mask2 = svcmpeq_f32(ptrue, f2, f4); + auto res1 = svsel_f32(mask1, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + auto res2 = svsel_f32(mask2, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + + auto bf16_1 = svreinterpret_bf16_f32(res1); + auto bf16_2 = svreinterpret_bf16_f32(res2); + return svuzp1_bf16(bf16_1, bf16_2); +} +Vectorized inline Vectorized::operator!=( + const Vectorized& other) const { + auto [f1, f2] = 
convert_bfloat16_float(values); + auto [f3, f4] = convert_bfloat16_float(other); + svbool_t mask1 = svcmpne_f32(ptrue, f1, f3); + svbool_t mask2 = svcmpne_f32(ptrue, f2, f4); + auto res1 = svsel_f32(mask1, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + auto res2 = svsel_f32(mask2, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); + + auto bf16_1 = svreinterpret_bf16_f32(res1); + auto bf16_2 = svreinterpret_bf16_f32(res2); + return svuzp1_bf16(bf16_1, bf16_2); +} +Vectorized inline Vectorized::operator>( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 > v3, v2 > v4); +} +Vectorized inline Vectorized::operator>=( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 >= v3, v2 >= v4); +} +Vectorized inline Vectorized::operator<( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 < v3, v2 < v4); +} +Vectorized inline Vectorized::operator<=( + const Vectorized& other) const { + auto [v1, v2] = convert_bfloat16_float(*this); + auto [v3, v4] = convert_bfloat16_float(other); + return convert_float_bfloat16(v1 <= v3, v2 <= v4); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), + a, + b); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. 
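// [Editorial aside, not part of the patch] The comparison operators and the
// maximum/minimum overloads in this new SVE BFloat16 file all route through
// convert_bfloat16_float / binary_operator_via_float, so lane-wise results
// (including NaN propagation for maximum/minimum) are inherited from the
// fp32 implementation. A hypothetical call site, assuming an SVE256 build
// with __ARM_FEATURE_BF16:
//
//   using Vec = at::vec::Vectorized<c10::BFloat16>;
//   Vec a(c10::BFloat16(1.0f));
//   Vec b(c10::BFloat16(2.0f));
//   Vec hi = at::vec::maximum(a, b);  // computed lane-wise in fp32
//   Vec eq = a.eq(b);                 // 1.0 where equal, 0.0 elsewhere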
+template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), + a, + b); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&clamp_max), + a, + max); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&clamp_min), + a, + min); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return clamp_min(clamp_max(a, max), min); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + svand_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + svorr_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_bf16_u16( + sveor_u16_x(ptrue, svreinterpret_u16_bf16(a), svreinterpret_u16_bf16(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { + const int64_t fraction = n % Vectorized::size(); +#pragma unroll + for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { + svst1_bf16( + ptrue, + const_cast(reinterpret_cast(dst)) + i, + svldnt1_bf16( + ptrue, + const_cast(reinterpret_cast(src)) + + i)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + svbool_t pg = svwhilelt_b16(i, n); + svst1_bf16( + pg, + const_cast(reinterpret_cast(dst)) + i, + svldnt1_bf16( + pg, + const_cast(reinterpret_cast(src)) + + i)); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b + c; +} + +#endif // defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16) + +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at diff --git a/aten/src/ATen/cpu/vec/sve/vec_common_sve.h b/aten/src/ATen/cpu/vec/sve/vec_common_sve.h index c7968e271f91..d20556c504f9 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_common_sve.h +++ b/aten/src/ATen/cpu/vec/sve/vec_common_sve.h @@ -5,17 +5,30 @@ #include +<<<<<<< HEAD #include #include #if defined(CPU_CAPABILITY_SVE) #include #include +======= +#include +#include + +#if defined(CPU_CAPABILITY_SVE) +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // Note [CPU_CAPABILITY namespace] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -30,6 +43,7 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_SVE) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> inline Vectorized cast(const Vectorized& src) { @@ -71,11 +85,57 @@ template std::enable_if_t> inline gather(const float* base_addr, const Vectorized& vindex_) { svint32_t vindex = svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); +======= +#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix) \ + template <> \ + inline Vectorized cast(const Vectorized& src) { \ + return svreinterpret_##t1_prefix##_##t2_prefix(src); \ + } \ + template <> \ + inline Vectorized cast(const Vectorized& src) { \ + return svreinterpret_##t2_prefix##_##t1_prefix(src); \ + } + +DEFINE_SVE_CAST(int64_t, s64, double, f64) +DEFINE_SVE_CAST(int32_t, s32, double, f64) +DEFINE_SVE_CAST(int16_t, s16, double, f64) +DEFINE_SVE_CAST(int64_t, s64, float, f32) +DEFINE_SVE_CAST(int32_t, s32, float, f32) +DEFINE_SVE_CAST(int16_t, s16, float, f32) +DEFINE_SVE_CAST(float, f32, double, f64) + +#ifdef __ARM_FEATURE_BF16 +DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16) +DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16) +DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16) +#endif // __ARM_FEATURE_BF16 + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex_) { + svint64_t vindex = + svasrd_n_s64_x(ptrue, svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale)), 3); + return svld1_gather_s64index_f64(ptrue, base_addr, vindex); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex_) { + svint32_t vindex = + svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svld1_gather_s32index_f32(ptrue, base_addr, vindex); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, @@ -94,12 +154,43 @@ inline mask_gather(const Vectorized& src, const float* base_addr, ALL_S32_TRUE_MASK); svint32_t vindex = svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); return svsel_f32(mask, svld1_gather_s32index_f32(mask, base_addr, vindex), src); +======= +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex_, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_), ALL_S64_TRUE_MASK); + svint64_t vindex = + svasrd_n_s64_x(ptrue, svmul_s64_x(ptrue, vindex_, svdup_n_s64(scale)), 3); + return svsel_f64( + mask, svld1_gather_s64index_f64(mask, base_addr, vindex), src); +} + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* 
base_addr, + const Vectorized& vindex_, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK); + svint32_t vindex = + svasrd_n_s32_x(ptrue, svmul_s32_x(ptrue, vindex_, svdup_n_s32(scale)), 2); + return svsel_f32( + mask, svld1_gather_s32index_f32(mask, base_addr, vindex), src); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Only works for inputs in the range: [-2^51, 2^51] // From: https://stackoverflow.com/a/41148578 +<<<<<<< HEAD template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { @@ -112,20 +203,42 @@ inline convert_to_int_of_same_size(const Vectorized &src) { template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { +======= +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + svfloat64_t x = svadd_f64_x(ptrue, src, svdup_n_f64(0x0018000000000000)); + return svsub_s64_x( + ptrue, + svreinterpret_s64_f64(x), + svreinterpret_s64_f64(svdup_n_f64(0x0018000000000000))); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svcvt_s32_f32_x(ptrue, src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { +======= +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, a1, a3, a3} // b = {b0, b1, b2, b3} // group cols crossing lanes: // return {a0, b0, a1, b1} // {a2, b2, a3, b3} +<<<<<<< HEAD return std::make_pair(Vectorized(svzip1_f64(a, b)), Vectorized(svzip2_f64(a, b))); } @@ -133,12 +246,24 @@ inline interleave2(const Vectorized& a, const Vectorized template <> std::pair, Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + Vectorized(svzip1_f64(a, b)), + Vectorized(svzip2_f64(a, b))); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, a1, a2, a3, a4, a5, a6, a7} // b = {b0, b1, b2, b3, b4, b5, b6, b7} // group cols crossing lanes: // return {a0, b0, a1, b1, a2, b2, a3, b3} // {a4, b4, a5, b5, a6, b6, a7, b7} +<<<<<<< HEAD return std::make_pair(Vectorized(svzip1_f32(a, b)), Vectorized(svzip2_f32(a, b))); } @@ -148,12 +273,45 @@ inline interleave2(const Vectorized& a, const Vectorized& b template <> std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + Vectorized(svzip1_f32(a, b)), Vectorized(svzip2_f32(a, b))); +} + +#ifdef __ARM_FEATURE_BF16 +template <> +std::pair< + Vectorized, + Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, 
b6, a7, b7} + return std::make_pair( + Vectorized(svzip1_bf16(a, b)), + Vectorized(svzip2_bf16(a, b))); +} +#endif // __ARM_FEATURE_BF16 + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1} // b = {a2, b2, a3, b3} // swap lanes: // return {a0, a1, a2, a3} // {b0, b1, b2, b3} +<<<<<<< HEAD return std::make_pair(Vectorized(svuzp1_f64(a, b)), Vectorized(svuzp2_f64(a, b))); } @@ -161,12 +319,24 @@ inline deinterleave2(const Vectorized& a, const Vectorized std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + Vectorized(svuzp1_f64(a, b)), + Vectorized(svuzp2_f64(a, b))); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1, a2, b2, a3, b3} // b = {a4, b4, a5, b5, a6, b6, a7, b7} // swap lanes: // return {a0, a1, a2, a3, a4, a5, a6, a7} // {b0, b1, b2, b3, b4, b5, b6, b7} +<<<<<<< HEAD return std::make_pair(Vectorized(svuzp1_f32(a, b)), Vectorized(svuzp2_f32(a, b))); } @@ -174,3 +344,33 @@ inline deinterleave2(const Vectorized& a, const Vectorized& #endif // defined(CPU_CAPABILITY_SVE) }} +======= + return std::make_pair( + Vectorized(svuzp1_f32(a, b)), Vectorized(svuzp2_f32(a, b))); +} + +#ifdef __ARM_FEATURE_BF16 +template <> +std::pair< + Vectorized, + Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + return std::make_pair( + Vectorized(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)), + Vectorized(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b))); +} +#endif // __ARM_FEATURE_BF16 + +#endif // defined(CPU_CAPABILITY_SVE) + +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_double.h b/aten/src/ATen/cpu/vec/sve/vec_double.h index 23626e29ce1c..b144ccafc2a4 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_double.h +++ b/aten/src/ATen/cpu/vec/sve/vec_double.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) #include @@ -24,10 +29,22 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD template <> class Vectorized { private: vls_float64_t values; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_float64_t values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = double; using size_type = int; static constexpr size_type size() { @@ -38,24 +55,41 @@ template <> 
class Vectorized { Vectorized(double val) { values = svdup_n_f64(val); } +<<<<<<< HEAD template> Vectorized(Args... vals) { __at_align__ double buffer[size()] = { vals... }; +======= + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... vals) { + __at_align__ double buffer[size()] = {vals...}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) values = svld1_f64(ptrue, buffer); } operator svfloat64_t() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { // Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise. +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each element is 1 if the corresponding bit in + // 'mask' is set, 0 otherwise. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ int64_t flag_arr[size()]; for (int i = 0; i < size(); i++) { flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0; } // Load the flag array into an SVE int64 vector. svint64_t int_mask = svld1_s64(svptrue_b64(), flag_arr); +<<<<<<< HEAD // Compare each lane of int_mask to 0; returns an svbool_t predicate where true indicates a nonzero flag. svbool_t blend_mask = svcmpne_n_s64(svptrue_b64(), int_mask, 0); @@ -71,14 +105,44 @@ template <> class Vectorized { } template static Vectorized arange(double base = 0., step_t step = static_cast(1)) { +======= + // Compare each lane of int_mask to 0; returns an svbool_t predicate where + // true indicates a nonzero flag. + svbool_t blend_mask = svcmpne_n_s64(svptrue_b64(), int_mask, 0); + + // Use svsel to select elements from b where the predicate is true, else + // from a. 
+ svfloat64_t result = svsel(blend_mask, b.values, a.values); + return Vectorized(result); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s64(ptrue, svreinterpret_s64_f64(mask_), ALL_S64_TRUE_MASK); + return svsel_f64(mask, b, a); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double buffer[size()]; for (int64_t i = 0; i < size(); i++) { buffer[i] = base + i * step; } return svld1_f64(ptrue, buffer); } +<<<<<<< HEAD static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == 0) { return a; } else if (count < size()) { @@ -100,19 +164,37 @@ template <> class Vectorized { svst1_f64(pg, reinterpret_cast(ptr), values); } } +<<<<<<< HEAD const double& operator[](int idx) const = delete; double& operator[](int idx) = delete; int64_t zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int64_t zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t mask = 0; __at_align__ int64_t mask_array[size()]; svbool_t svbool_mask = svcmpeq_f64(ptrue, values, ZERO_F64); +<<<<<<< HEAD svst1_s64(ptrue, mask_array, svsel_s64(svbool_mask, ALL_S64_TRUE_MASK, ALL_S64_FALSE_MASK)); for (int64_t i = 0; i < size(); ++i) { if (mask_array[i]) mask |= (1ull << i); +======= + svst1_s64( + ptrue, + mask_array, + svsel_s64(svbool_mask, ALL_S64_TRUE_MASK, ALL_S64_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return mask; } @@ -122,7 +204,13 @@ template <> class Vectorized { return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); } bool has_inf_nan() const { +<<<<<<< HEAD return svptest_any(ptrue, svcmpuo_f64(ptrue, svsub_f64_x(ptrue, values, values), ZERO_F64)); +======= + return svptest_any( + ptrue, + svcmpuo_f64(ptrue, svsub_f64_x(ptrue, values, values), ZERO_F64)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(double (*f)(double)) const { __at_align__ double tmp[size()]; @@ -155,6 +243,7 @@ template <> class Vectorized { return *this; } Vectorized acos() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_acosdx_u10sve(values)),map(std::acos)); } Vectorized acosh() const { @@ -174,6 +263,33 @@ template <> class Vectorized { } Vectorized atan2(const Vectorized &b) const { USE_SLEEF({return Vectorized(Sleef_atan2dx_u10sve(values, b));}, +======= + return USE_SLEEF( + Vectorized(Sleef_acosdx_u10sve(values)), map(std::acos)); + } + Vectorized acosh() const { + return USE_SLEEF( + 
Vectorized(Sleef_acoshdx_u10sve(values)), map(std::acosh)); + } + Vectorized asin() const { + return USE_SLEEF( + Vectorized(Sleef_asindx_u10sve(values)), map(std::asin)); + } + Vectorized asinh() const { + return USE_SLEEF( + Vectorized(Sleef_asinhdx_u10sve(values)), map(std::asinh)); + } + Vectorized atan() const { + return USE_SLEEF( + Vectorized(Sleef_atandx_u10sve(values)), map(std::atan)); + } + Vectorized atanh() const { + return USE_SLEEF( + Vectorized(Sleef_atanhdx_u10sve(values)), map(std::atanh)); + } + Vectorized atan2(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_atan2dx_u10sve(values, b)); }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { __at_align__ double tmp[size()]; __at_align__ double tmp_b[size()]; @@ -183,6 +299,7 @@ template <> class Vectorized { tmp[i] = std::atan2(tmp[i], tmp_b[i]); } return loadu(tmp); +<<<<<<< HEAD } ) } @@ -205,11 +322,33 @@ template <> class Vectorized { } Vectorized erfc() const { return USE_SLEEF(Vectorized(Sleef_erfcdx_u15sve(values)),map(std::erfc)); +======= + })} Vectorized copysign(const Vectorized& sign) const { + USE_SLEEF( + { return Vectorized(Sleef_copysigndx_sve(values, sign)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_sign[size()]; + store(tmp); + sign.store(tmp_sign); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::copysign(tmp[i], tmp_sign[i]); + } + return loadu(tmp); + })} Vectorized erf() const { + return USE_SLEEF( + Vectorized(Sleef_erfdx_u10sve(values)), map(std::erf)); + } + Vectorized erfc() const { + return USE_SLEEF( + Vectorized(Sleef_erfcdx_u15sve(values)), map(std::erfc)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized erfinv() const { return map(calc_erfinv); } Vectorized exp() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_expdx_u10sve(values)),map(std::exp)); } Vectorized exp2() const { @@ -217,13 +356,31 @@ template <> class Vectorized { } Vectorized expm1() const { return USE_SLEEF(Vectorized(Sleef_expm1dx_u10sve(values)),map(std::expm1)); +======= + return USE_SLEEF( + Vectorized(Sleef_expdx_u10sve(values)), map(std::exp)); + } + Vectorized exp2() const { + return USE_SLEEF( + Vectorized(Sleef_exp2dx_u10sve(values)), map(std::exp2)); + } + Vectorized expm1() const { + return USE_SLEEF( + Vectorized(Sleef_expm1dx_u10sve(values)), map(std::expm1)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fmod(const Vectorized& q) const { USE_SLEEF({return Vectorized(Sleef_fmoddx_sve(values, q));}, { +======= + Vectorized fmod(const Vectorized& q) const {USE_SLEEF( + { return Vectorized(Sleef_fmoddx_sve(values, q)); }, + { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_q[size()]; store(tmp); @@ -232,6 +389,7 @@ template <> class Vectorized { tmp[i] = std::fmod(tmp[i], tmp_q[i]); } return loadu(tmp); +<<<<<<< HEAD } ) } @@ -249,6 +407,21 @@ template <> class Vectorized { }) } Vectorized i0() const { +======= + })} Vectorized hypot(const Vectorized& b) const { + USE_SLEEF( + { return Vectorized(Sleef_hypotdx_u05sve(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; 
+ store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::hypot(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized i0() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(calc_i0); } Vectorized i0e() const { @@ -257,7 +430,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -267,7 +444,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -277,11 +458,16 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { USE_SLEEF( { return Vectorized(Sleef_nextafterdx_sve(values, b)); }, +======= + Vectorized nextafter(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_nextafterdx_sve(values, b)); }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { __at_align__ double tmp[size()]; __at_align__ double tmp_b[size()]; @@ -291,6 +477,7 @@ template <> class Vectorized { tmp[i] = std::nextafter(tmp[i], tmp_b[i]); } return loadu(tmp); +<<<<<<< HEAD } ) } @@ -318,6 +505,40 @@ template <> class Vectorized { } Vectorized cosh() const { return USE_SLEEF( Vectorized(Sleef_coshdx_u10sve(values)),map(std::cosh)); +======= + })} Vectorized log() const { + return USE_SLEEF( + Vectorized(Sleef_logdx_u10sve(values)), map(std::log)); + } + Vectorized log2() const { + return USE_SLEEF( + Vectorized(Sleef_log2dx_u10sve(values)), map(std::log2)); + } + Vectorized log10() const { + return USE_SLEEF( + Vectorized(Sleef_log10dx_u10sve(values)), map(std::log10)); + } + Vectorized log1p() const { + return USE_SLEEF( + Vectorized(Sleef_log1pdx_u10sve(values)), map(std::log1p)); + } + Vectorized frac() const; + Vectorized sin() const { + return USE_SLEEF( + Vectorized(Sleef_sindx_u10sve(values)), map(std::sin)); + } + Vectorized sinh() const { + return USE_SLEEF( + Vectorized(Sleef_sinhdx_u10sve(values)), map(std::sinh)); + } + Vectorized cos() const { + return USE_SLEEF( + Vectorized(Sleef_cosdx_u10sve(values)), map(std::cos)); + } + Vectorized cosh() const { + return USE_SLEEF( + Vectorized(Sleef_coshdx_u10sve(values)), map(std::cosh)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized ceil() const { return svrintp_f64_x(ptrue, values); @@ -332,16 +553,30 @@ template <> class Vectorized { return svrinti_f64_x(ptrue, values); } Vectorized tan() const { +<<<<<<< HEAD return USE_SLEEF( Vectorized(Sleef_tandx_u10sve(values)),map(std::tan)); } Vectorized tanh() const { return USE_SLEEF( Vectorized(Sleef_tanhdx_u10sve(values)),map(std::tanh)); +======= + return USE_SLEEF( + Vectorized(Sleef_tandx_u10sve(values)), map(std::tan)); + } + Vectorized tanh() const 
{ + return USE_SLEEF( + Vectorized(Sleef_tanhdx_u10sve(values)), map(std::tanh)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized trunc() const { return svrintz_f64_x(ptrue, values); } Vectorized lgamma() const { +<<<<<<< HEAD return USE_SLEEF( Vectorized(Sleef_lgammadx_u10sve(values)),map(std::lgamma)); +======= + return USE_SLEEF( + Vectorized(Sleef_lgammadx_u10sve(values)), map(std::lgamma)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized sqrt() const { return svsqrt_f64_x(ptrue, values); @@ -352,6 +587,7 @@ template <> class Vectorized { Vectorized rsqrt() const { return svdivr_f64_x(ptrue, svsqrt_f64_x(ptrue, values), ONE_F64); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { USE_SLEEF( {return Vectorized(Sleef_powdx_u10sve(values, b));}, { @@ -369,6 +605,22 @@ template <> class Vectorized { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +======= + Vectorized pow(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_powdx_u10sve(values, b)); }, + { + __at_align__ double tmp[size()]; + __at_align__ double tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::pow(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized operator==(const Vectorized& other) const { svbool_t mask = svcmpeq_f64(ptrue, values, other); return svsel_f64(mask, ALL_F64_TRUE_MASK, ALL_F64_FALSE_MASK); @@ -408,22 +660,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svadd_f64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svsub_f64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmul_f64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svdiv_f64_x(ptrue, a, b); } @@ -435,33 +711,65 @@ Vectorized inline Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. 
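For reference, a minimal scalar sketch of the NaN-propagating `maximum` semantics the comment above refers to (illustrative only, not part of this patch; `ieee_maximum` is a hypothetical name). Note that std::fmax behaves differently and returns the non-NaN operand:

#include <cmath>
#include <limits>

// IEEE 754-201x maximum: if either operand is NaN, the result is NaN.
double ieee_maximum(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) {
    return std::numeric_limits<double>::quiet_NaN();  // propagate NaN
  }
  return a > b ? a : b;
}
// ieee_maximum(1.0, NAN) -> NaN, whereas std::fmax(1.0, NAN) -> 1.0.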
template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmax_f64_x(ptrue, a, b); } // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f64_x(ptrue, max, svmax_f64_x(ptrue, min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f64_x(ptrue, max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmax_f64_x(ptrue, min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { return svreinterpret_f64_s64(svand_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); } @@ -497,6 +805,58 @@ Vectorized inline Vectorized::lt(const Vectorized& other } Vectorized inline Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + svand_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + svorr_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f64_s64( + sveor_s64_x(ptrue, svreinterpret_s64_f64(a), svreinterpret_s64_f64(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0); } @@ -515,10 +875,22 @@ inline void convert(const double* src, double* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmad_f64_x(ptrue, a, b, c); } #endif // defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_float.h b/aten/src/ATen/cpu/vec/sve/vec_float.h index 6a3dc2bc1c10..bbaa6e686cf7 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_float.h +++ b/aten/src/ATen/cpu/vec/sve/vec_float.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) #include @@ -24,10 +29,22 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD template <> class Vectorized { private: vls_float32_t values; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + vls_float32_t values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = float; using size_type = int; static constexpr size_type size() { @@ -38,16 +55,25 @@ template <> class Vectorized { Vectorized(float val) { values = svdup_n_f32(val); } +<<<<<<< HEAD template> Vectorized(Args... vals) { __at_align__ float buffer[size()] = { vals... }; +======= + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... vals) { + __at_align__ float buffer[size()] = {vals...}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) values = svld1_f32(ptrue, buffer); } operator svfloat32_t() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { // Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise. __at_align__ int32_t flag_arr[size()]; @@ -70,14 +96,54 @@ template <> class Vectorized { } template static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each element is 1 if the corresponding bit in + // 'mask' is set, 0 otherwise. + __at_align__ int32_t flag_arr[size()]; + for (int i = 0; i < size(); i++) { + flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0; + } + // Load the flag array into an SVE int32 vector. + svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr); + // Compare each lane of int_mask to 0; returns an svbool_t predicate where + // true indicates a nonzero flag. 
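For intuition, a scalar model of the bit-mask blend being built above (a sketch only, not part of this patch; `blend_ref` and the example mask are hypothetical):

#include <array>
#include <cstddef>
#include <cstdint>

// Lane i takes b[i] when bit i of `mask` is set, otherwise a[i].
// For example, mask = 0b0101 picks b in lanes 0 and 2, and a elsewhere.
template <uint64_t mask, std::size_t N>
std::array<float, N> blend_ref(const std::array<float, N>& a,
                               const std::array<float, N>& b) {
  std::array<float, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = (mask & (1ULL << i)) ? b[i] : a[i];
  }
  return out;
}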
+ svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0); + // Use svsel to select elements from b where the predicate is true, else + // from a. + svfloat32_t result = svsel_f32(blend_mask, b.values, a.values); + return Vectorized(result); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask_) { + svbool_t mask = + svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK); + return svsel_f32(mask, b, a); + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float buffer[size()]; for (int64_t i = 0; i < size(); i++) { buffer[i] = base + i * step; } return svld1_f32(ptrue, buffer); } +<<<<<<< HEAD static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == 0) { return a; } else if (count < size()) { @@ -85,6 +151,74 @@ template <> class Vectorized { } return b; } +<<<<<<< HEAD +======= + // Implementation is picked from + // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L105 + inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x) const { + const auto c1 = + svreinterpret_f32_u32(svdup_n_u32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f + const auto c2 = + svreinterpret_f32_u32(svdup_n_u32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f + const auto c3 = + svreinterpret_f32_u32(svdup_n_u32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f + const auto c4 = + svreinterpret_f32_u32(svdup_n_u32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f + const auto c5 = + svreinterpret_f32_u32(svdup_n_u32(0x3c072010)); // x^5: 0x1.0e4020p-7f + const auto shift = svreinterpret_f32_u32( + svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto inv_ln2 = svreinterpret_f32_u32( + svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f + const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32( + 0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f + const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32( + 0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f + const auto inf = svdup_n_f32(std::numeric_limits::infinity()); + const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5) + const auto zero = svdup_n_f32(0.f); + const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125) + // Range reduction: + // e^x = 2^n * e^r + // where: + // n = floor(x / ln(2)) + // r = x - n * ln(2) + // + // By adding x / ln(2) with 2^23 + 127 (shift): + // * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 + // forces decimal part + // of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. + // n) + 127 will occupy the whole fraction part of z in FP32 format. + // Subtracting 2^23 + 127 (shift) from z will result in the integer part + // of x / ln(2) (i.e. n) because the decimal part has been pushed out + // and lost. + // * The addition of 127 makes the FP32 fraction part of z ready to be + // used as the exponent + // in FP32 format. Left shifting z by 23 bits will result in 2^n. 
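A plain-C++ sketch of the range reduction described in the comment above (illustrative only, not part of this patch; it uses std::ldexp and a plain Taylor polynomial instead of the bit-level 2^23 + 127 shift and the tuned coefficients c1..c5):

#include <cmath>

// e^x = 2^n * e^r with n ~= round(x / ln 2) and r = x - n * ln 2, so that
// |r| <= ln(2)/2 and a short polynomial approximates e^r accurately.
float exp_reduced_ref(float x) {
  const float inv_ln2 = 1.4426950408889634f;  // 1 / ln(2)
  const float ln2 = 0.6931471805599453f;
  float n = std::nearbyint(x * inv_ln2);      // exponent of the 2^n factor
  float r = x - n * ln2;                      // reduced argument
  // Degree-5 Taylor series of e^r; the kernel uses minimax coefficients.
  float poly =
      1.f + r * (1.f + r * (0.5f + r * (1.f / 6.f + r * (1.f / 24.f + r * (1.f / 120.f)))));
  return std::ldexp(poly, static_cast<int>(n));  // multiply by 2^n
}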
+ const auto z = svmla_f32_z(pg, shift, x, inv_ln2); + const auto n = svsub_f32_z(pg, z, shift); + const auto scale = svreinterpret_f32_u32( + svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n + // The calculation of n * ln(2) is done using 2 steps to achieve accuracy + // beyond FP32. This outperforms longer Taylor series (3-4 tabs) both in + // term of accuracy and performance. + const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi); + const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo); + // Compute the truncated Taylor series of e^r. + // poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5) + const auto r2 = svmul_f32_z(pg, r, r); + const auto p1 = svmul_f32_z(pg, c1, r); + const auto p23 = svmla_f32_z(pg, c2, c3, r); + const auto p45 = svmla_f32_z(pg, c4, c5, r); + const auto p2345 = svmla_f32_z(pg, p23, p45, r2); + const auto p12345 = svmla_f32_z(pg, p1, p2345, r2); + auto poly = svmla_f32_z(pg, scale, p12345, scale); + // Handle underflow and overflow. + poly = svsel_f32(svcmplt_f32(pg, x, min_input), zero, poly); + poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly); + return poly; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized loadu(const void* ptr, int64_t count = size()) { if (count == size()) return svld1_f32(ptrue, reinterpret_cast(ptr)); @@ -99,19 +233,37 @@ template <> class Vectorized { svst1_f32(pg, reinterpret_cast(ptr), values); } } +<<<<<<< HEAD const float& operator[](int idx) const = delete; float& operator[](int idx) = delete; int64_t zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int64_t zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t mask = 0; __at_align__ int32_t mask_array[size()]; svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32); +<<<<<<< HEAD svst1_s32(ptrue, mask_array, svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK)); for (int64_t i = 0; i < size(); ++i) { if (mask_array[i]) mask |= (1ull << i); +======= + svst1_s32( + ptrue, + mask_array, + svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK)); + for (int64_t i = 0; i < size(); ++i) { + if (mask_array[i]) + mask |= (1ull << i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return mask; } @@ -121,7 +273,13 @@ template <> class Vectorized { return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); } bool has_inf_nan() const { +<<<<<<< HEAD return svptest_any(ptrue, svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32)); +======= + return svptest_any( + ptrue, + svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(float (*f)(float)) const { __at_align__ float tmp[size()]; @@ -154,6 +312,7 @@ template <> class Vectorized { return values; } Vectorized acos() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_acosfx_u10sve(values)),map(std::acos)); } Vectorized acosh() const { @@ -173,11 
+332,39 @@ template <> class Vectorized { } Vectorized atan2(const Vectorized &b) const { USE_SLEEF({return Vectorized(Sleef_atan2fx_u10sve(values, b));}, +======= + return USE_SLEEF( + Vectorized(Sleef_acosfx_u10sve(values)), map(std::acos)); + } + Vectorized acosh() const { + return USE_SLEEF( + Vectorized(Sleef_acoshfx_u10sve(values)), map(std::acosh)); + } + Vectorized asin() const { + return USE_SLEEF( + Vectorized(Sleef_asinfx_u10sve(values)), map(std::asin)); + } + Vectorized asinh() const { + return USE_SLEEF( + Vectorized(Sleef_asinhfx_u10sve(values)), map(std::asinh)); + } + Vectorized atan() const { + return USE_SLEEF( + Vectorized(Sleef_atanfx_u10sve(values)), map(std::atan)); + } + Vectorized atanh() const { + return USE_SLEEF( + Vectorized(Sleef_atanhfx_u10sve(values)), map(std::atanh)); + } + Vectorized atan2(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_atan2fx_u10sve(values, b)); }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { __at_align__ float tmp[size()]; __at_align__ float tmp_b[size()]; store(tmp); b.store(tmp_b); +<<<<<<< HEAD for (int64_t i = 0; i < size(); i++){ tmp[i] = std::atan2(tmp[i], tmp_b[i]); } @@ -204,11 +391,38 @@ template <> class Vectorized { } Vectorized erfc() const { return USE_SLEEF(Vectorized(Sleef_erfcfx_u15sve(values)),map(std::erfc)); +======= + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::atan2(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized copysign(const Vectorized& sign) const { + + USE_SLEEF( + { return Vectorized(Sleef_copysignfx_sve(values, sign)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_sign[size()]; + store(tmp); + sign.store(tmp_sign); + for (int64_t i = 0; i < size(); ++i) { + tmp[i] = std::copysign(tmp[i], tmp_sign[i]); + } + return loadu(tmp); + })} Vectorized erf() const { + return USE_SLEEF( + Vectorized(Sleef_erffx_u10sve(values)), map(std::erf)); + } + Vectorized erfc() const { + return USE_SLEEF( + Vectorized(Sleef_erfcfx_u15sve(values)), map(std::erfc)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized erfinv() const { return map(calc_erfinv); } Vectorized exp() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_expfx_u10sve(values)),map(std::exp)); } Vectorized exp2() const { @@ -216,13 +430,31 @@ template <> class Vectorized { } Vectorized expm1() const { return USE_SLEEF(Vectorized(Sleef_expm1fx_u10sve(values)),map(std::expm1)); +======= + return USE_SLEEF( + Vectorized(Sleef_expfx_u10sve(values)), map(std::exp)); + } + Vectorized exp2() const { + return USE_SLEEF( + Vectorized(Sleef_exp2fx_u10sve(values)), map(std::exp2)); + } + Vectorized expm1() const { + return USE_SLEEF( + Vectorized(Sleef_expm1fx_u10sve(values)), map(std::expm1)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fmod(const Vectorized& q) const { USE_SLEEF({return Vectorized(Sleef_fmodfx_sve(values, q));}, { +======= + Vectorized fmod(const Vectorized& q) const {USE_SLEEF( + { return Vectorized(Sleef_fmodfx_sve(values, q)); }, + { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_q[size()]; store(tmp); @@ -231,6 +463,7 @@ template 
<> class Vectorized { tmp[i] = std::fmod(tmp[i], tmp_q[i]); } return loadu(tmp); +<<<<<<< HEAD }) } Vectorized hypot(const Vectorized &b) const { @@ -248,6 +481,21 @@ template <> class Vectorized { ) } Vectorized i0() const { +======= + })} Vectorized hypot(const Vectorized& b) const { + USE_SLEEF( + { return Vectorized(Sleef_hypotfx_u05sve(values, b)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::hypot(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} Vectorized i0() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(calc_i0); } Vectorized i0e() const { @@ -256,7 +504,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -266,7 +518,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -276,11 +532,16 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { USE_SLEEF( { return Vectorized(Sleef_nextafterfx_sve(values, b)); }, +======= + Vectorized nextafter(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_nextafterfx_sve(values, b)); }, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) { __at_align__ float tmp[size()]; __at_align__ float tmp_b[size()]; @@ -290,6 +551,7 @@ template <> class Vectorized { tmp[i] = std::nextafter(tmp[i], tmp_b[i]); } return loadu(tmp); +<<<<<<< HEAD } ) } @@ -317,6 +579,40 @@ template <> class Vectorized { } Vectorized cosh() const { return USE_SLEEF(Vectorized(Sleef_coshfx_u10sve(values)),map(std::cosh)); +======= + })} Vectorized log() const { + return USE_SLEEF( + Vectorized(Sleef_logfx_u10sve(values)), map(std::log)); + } + Vectorized log2() const { + return USE_SLEEF( + Vectorized(Sleef_log2fx_u10sve(values)), map(std::log2)); + } + Vectorized log10() const { + return USE_SLEEF( + Vectorized(Sleef_log10fx_u10sve(values)), map(std::log10)); + } + Vectorized log1p() const { + return USE_SLEEF( + Vectorized(Sleef_log1pfx_u10sve(values)), map(std::log1p)); + } + Vectorized frac() const; + Vectorized sin() const { + return USE_SLEEF( + Vectorized(Sleef_sinfx_u10sve(values)), map(std::sin)); + } + Vectorized sinh() const { + return USE_SLEEF( + Vectorized(Sleef_sinhfx_u10sve(values)), map(std::sinh)); + } + Vectorized cos() const { + return USE_SLEEF( + Vectorized(Sleef_cosfx_u10sve(values)), map(std::cos)); + } + Vectorized cosh() const { + return USE_SLEEF( + Vectorized(Sleef_coshfx_u10sve(values)), map(std::cosh)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized ceil() const { return svrintp_f32_x(ptrue, 
values); @@ -331,16 +627,67 @@ template <> class Vectorized { return svrinti_f32_x(ptrue, values); } Vectorized tan() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_tanfx_u10sve(values)),map(std::tan)); } Vectorized tanh() const { return USE_SLEEF(Vectorized(Sleef_tanhfx_u10sve(values)),map(std::tanh)); +======= + return USE_SLEEF( + Vectorized(Sleef_tanfx_u10sve(values)), map(std::tan)); + } + // Implementation is picked from + // https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179 + Vectorized tanh() const { + // Constants used for the tanh calculation. + const svfloat32_t CONST_1 = + svdup_n_f32(1.f); // Constant 1.0f for the tanh formula. + const svfloat32_t CONST_2 = svdup_n_f32( + 2.f); // Constant 2.0f for the tanh formula (used in exp(2x)). + const svfloat32_t CONST_MIN_TANH = svdup_n_f32( + -10.f); // Minimum threshold for input values to prevent overflow. + const svfloat32_t CONST_MAX_TANH = svdup_n_f32( + 10.f); // Maximum threshold for input values to prevent overflow. + + // Step 1: Clamp the values within the range [-10, 10] to prevent overflow + // during exponentiation. The tanh function approaches ±1 rapidly as the + // input grows large, so we limit the input range to avoid numerical + // instability. svmax_f32_z ensures values are greater than -10, and + // svmin_f32_z ensures they are less than 10. + svfloat32_t x = svmin_f32_z( + ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH); + + // Step 2: Calculate exp(2 * x), where x is the clamped value. + // svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of + // the result. + svfloat32_t exp2x = svexp_f32_z(ptrue, svmul_f32_z(ptrue, CONST_2, x)); + + // Step 3: Calculate the numerator of the tanh function, which is exp(2x) + // - 1. + svfloat32_t num = svsub_f32_z(ptrue, exp2x, CONST_1); + + // Step 4: Calculate the denominator of the tanh function, which is exp(2x) + // + 1. + svfloat32_t den = svadd_f32_z(ptrue, exp2x, CONST_1); + + // Step 5: Calculate the tanh function as the ratio of the numerator and + // denominator: num / den. + svfloat32_t tanh = svdiv_f32_z(ptrue, num, den); + + // Return the calculated tanh values. + return tanh; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized trunc() const { return svrintz_f32_x(ptrue, values); } Vectorized lgamma() const { +<<<<<<< HEAD return USE_SLEEF(Vectorized(Sleef_lgammafx_u10sve(values)),map(std::lgamma)); +======= + return USE_SLEEF( + Vectorized(Sleef_lgammafx_u10sve(values)), map(std::lgamma)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized sqrt() const { return svsqrt_f32_x(ptrue, values); @@ -351,6 +698,7 @@ template <> class Vectorized { Vectorized rsqrt() const { return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { USE_SLEEF( {return Vectorized(Sleef_powfx_u10sve(values, b));}, { @@ -368,6 +716,22 @@ template <> class Vectorized { // Comparison using the _CMP_**_OQ predicate. 
// `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +======= + Vectorized pow(const Vectorized& b) const {USE_SLEEF( + { return Vectorized(Sleef_powfx_u10sve(values, b)); }, + { + __at_align__ float tmp[size()]; + __at_align__ float tmp_b[size()]; + store(tmp); + b.store(tmp_b); + for (int64_t i = 0; i < size(); i++) { + tmp[i] = std::pow(tmp[i], tmp_b[i]); + } + return loadu(tmp); + })} // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized operator==(const Vectorized& other) const { svbool_t mask = svcmpeq_f32(ptrue, values, other); return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK); @@ -407,22 +771,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svadd_f32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svsub_f32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmul_f32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svdiv_f32_x(ptrue, a, b); } @@ -434,33 +822,65 @@ Vectorized inline Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmax_f32_x(ptrue, a, b); } // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. 
template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmin_f32_x(ptrue, max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmax_f32_x(ptrue, min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { return svreinterpret_f32_s32(svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); } @@ -496,6 +916,58 @@ Vectorized inline Vectorized::lt(const Vectorized& other) c } Vectorized inline Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return svreinterpret_f32_s32( + sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b))); +} + +Vectorized inline Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +Vectorized inline Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -514,33 +986,52 @@ inline void convert(const float* src, float* dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const float *src, at::Half *dst, int64_t n) { +======= +inline void convert(const float* src, at::Half* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized::size()); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); +======= + svfloat16_t src_vec = svuzp1_f16( + svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f16(pg_16, reinterpret_cast(dst) + i, src_vec); } #pragma unroll for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg_16 = svwhilelt_b16(i, n); pg_32 = svwhilelt_b32(i, n); +<<<<<<< HEAD svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); +======= + svfloat16_t src_vec = svuzp1_f16( + svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f16(pg_16, reinterpret_cast(dst) + i, src_vec); } } template <> +<<<<<<< HEAD inline void convert(const at::Half *src, float *dst, int64_t n) { +======= +inline void convert(const at::Half* src, float* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized::size()); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast(src) + i), ZERO_F16); svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec)); @@ -551,18 +1042,41 @@ inline void convert(const at::Half *src, float *dst, int64_t n) { pg_32 = svwhilelt_b32(i, n); svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast(src) + i), ZERO_F16); +======= + svfloat16_t src_vec = svzip1_f16( + svldnt1_f16(pg_16, reinterpret_cast(src) + i), + ZERO_F16); + svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec)); + } +#pragma unroll + for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { + pg_16 = svwhilelt_b16(i, n); + pg_32 = svwhilelt_b32(i, n); + svfloat16_t src_vec = svzip1_f16( + svldnt1_f16(pg_16, reinterpret_cast(src) + i), + ZERO_F16); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec)); } } template <> +<<<<<<< HEAD inline void convert(const bool *src, float *dst, int64_t n) { +======= +inline void convert(const bool* src, float* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized::size()); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32)); @@ -571,7 +1085,12 @@ inline void convert(const bool *src, float *dst, int64_t n) { for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg_8 = svwhilelt_b8(i, n); pg_32 = svwhilelt_b32(i, n); +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32)); @@ -579,10 +1098,22 @@ inline void convert(const bool *src, float *dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svmad_f32_x(ptrue, a, b, c); } #endif // defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_int.h b/aten/src/ATen/cpu/vec/sve/vec_int.h index 1e8c76ab0572..03c26a6ac909 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_int.h +++ b/aten/src/ATen/cpu/vec/sve/vec_int.h @@ -1,9 +1,14 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // Note [CPU_CAPABILITY namespace] @@ -18,6 +23,7 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD #define VEC_INT_SVE_TEMPLATE(vl, bit) \ template <> class Vectorized { \ private: \ @@ -217,6 +223,247 @@ Vectorized inline Vectorized::lt(const Vectorized inline Vectorized::le(const Vectorized& other) const { \ return (*this <= other) & Vectorized(1); \ } +======= +#define VEC_INT_SVE_TEMPLATE(vl, bit) \ + template <> \ + struct is_vec_specialized_for : std::bool_constant {}; \ + \ + template <> \ + class Vectorized { \ + private: \ + vls_int##bit##_t values; \ + \ + public: \ + using value_type = int##bit##_t; \ + using size_type = int; \ + static constexpr size_type size() { \ + return vl; \ + } \ + Vectorized() {} \ + Vectorized(svint##bit##_t v) : values(v) {} \ + Vectorized(int##bit##_t val) { \ + values = svdup_n_s##bit(val); \ + } \ + template < \ + typename... Args, \ + typename = std::enable_if_t<(sizeof...(Args) == size())>> \ + Vectorized(Args... vals) { \ + __at_align__ int##bit##_t buffer[size()] = {vals...}; \ + values = svld1_s##bit(ptrue, buffer); \ + } \ + operator svint##bit##_t() const { \ + return values; \ + } \ + template \ + static Vectorized blend( \ + const Vectorized& a, \ + const Vectorized& b) { \ + __at_align__ int##bit##_t flag_arr[size()]; \ + for (int i = 0; i < size(); ++i) { \ + flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 
1 : 0; \ + } \ + svbool_t blend_mask = svcmpne_n_s##bit( \ + svptrue_b##bit(), svld1_s##bit(svptrue_b##bit(), flag_arr), 0); \ + return Vectorized( \ + svsel_s##bit(blend_mask, b.values, a.values)); \ + } \ + static Vectorized blendv( \ + const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& mask_) { \ + svbool_t mask = svcmpeq_s##bit(ptrue, mask_, ALL_S##bit##_TRUE_MASK); \ + return svsel_s##bit(mask, b, a); \ + } \ + /* step sometimes requires a higher precision type (e.g., T=int, \ + * step_t=double) */ \ + template \ + static Vectorized arange( \ + int##bit##_t base = 0, \ + step_t step = static_cast(1)) { \ + __at_align__ int##bit##_t buffer[size()]; \ + for (int64_t i = 0; i < size(); i++) { \ + buffer[i] = base + i * step; \ + } \ + return svld1_s##bit(ptrue, buffer); \ + } \ + static Vectorized set( \ + const Vectorized& a, \ + const Vectorized& b, \ + int##bit##_t count = size()) { \ + if (count == 0) { \ + return a; \ + } else if (count < size()) { \ + return svsel_s##bit(svwhilelt_b##bit(0ull, count), b, a); \ + } \ + return b; \ + } \ + static Vectorized loadu( \ + const void* ptr, \ + int64_t count = size()) { \ + if (count == size()) \ + return svld1_s##bit( \ + ptrue, reinterpret_cast(ptr)); \ + svbool_t pg = svwhilelt_b##bit(0ull, count); \ + return svld1_s##bit(pg, reinterpret_cast(ptr)); \ + } \ + void store(void* ptr, int64_t count = size()) const { \ + if (count == size()) { \ + svst1_s##bit(ptrue, reinterpret_cast(ptr), values); \ + } else { \ + svbool_t pg = svwhilelt_b##bit(0ull, count); \ + svst1_s##bit(pg, reinterpret_cast(ptr), values); \ + } \ + } \ + const int##bit##_t& operator[](int idx) const = delete; \ + int##bit##_t& operator[](int idx) = delete; \ + Vectorized abs() const { \ + return svabs_s##bit##_x(ptrue, values); \ + } \ + Vectorized real() const { \ + return values; \ + } \ + Vectorized imag() const { \ + return svdup_n_s##bit(0); \ + } \ + Vectorized conj() const { \ + return values; \ + } \ + Vectorized frac() const; \ + Vectorized neg() const { \ + return svneg_s##bit##_x(ptrue, values); \ + } \ + Vectorized operator==( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpeq_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator!=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpne_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator<( \ + const Vectorized& other) const { \ + svbool_t mask = svcmplt_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator<=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmple_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator>( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpgt_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized operator>=( \ + const Vectorized& other) const { \ + svbool_t mask = svcmpge_s##bit(ptrue, values, other); \ + return svsel_s##bit( \ + mask, ALL_S##bit##_TRUE_MASK, ALL_S##bit##_FALSE_MASK); \ + } \ + Vectorized eq(const Vectorized& other) const; \ + Vectorized ne(const Vectorized& other) const; \ + Vectorized gt(const Vectorized& other) const; \ + Vectorized ge(const Vectorized& other) 
const; \ + Vectorized lt(const Vectorized& other) const; \ + Vectorized le(const Vectorized& other) const; \ + }; \ + template <> \ + Vectorized inline operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return svadd_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return svsub_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator*( \ + const Vectorized& a, const Vectorized& b) { \ + return svmul_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline maximum( \ + const Vectorized& a, const Vectorized& b) { \ + return svmax_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline minimum( \ + const Vectorized& a, const Vectorized& b) { \ + return svmin_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline clamp( \ + const Vectorized& a, \ + const Vectorized& min, \ + const Vectorized& max) { \ + return svmin_s##bit##_x(ptrue, max, svmax_s##bit##_x(ptrue, min, a)); \ + } \ + template <> \ + Vectorized inline clamp_max( \ + const Vectorized& a, \ + const Vectorized& max) { \ + return svmin_s##bit##_x(ptrue, max, a); \ + } \ + template <> \ + Vectorized inline clamp_min( \ + const Vectorized& a, \ + const Vectorized& min) { \ + return svmax_s##bit##_x(ptrue, min, a); \ + } \ + template <> \ + Vectorized inline operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return svand_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return svorr_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + Vectorized inline operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return sveor_s##bit##_x(ptrue, a, b); \ + } \ + template <> \ + inline Vectorized operator~( \ + const Vectorized& a) { \ + return sveor_s##bit##_x(ptrue, a, svdup_n_s##bit(-1)); \ + } \ + Vectorized inline Vectorized::eq( \ + const Vectorized& other) const { \ + return (*this == other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ne( \ + const Vectorized& other) const { \ + return (*this != other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::gt( \ + const Vectorized& other) const { \ + return (*this > other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ge( \ + const Vectorized& other) const { \ + return (*this >= other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::lt( \ + const Vectorized& other) const { \ + return (*this < other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::le( \ + const Vectorized& other) const { \ + return (*this <= other) & Vectorized(1); \ + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int64_t), 64) VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int32_t), 32) @@ -224,7 +471,13 @@ VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int16_t), 16) VEC_INT_SVE_TEMPLATE(VECTOR_WIDTH / sizeof(int8_t), 8) template +<<<<<<< HEAD Vectorized inline intdiv_nosve(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline intdiv_nosve( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T values_a[Vectorized::size()]; T values_b[Vectorized::size()]; a.store(values_a); @@ -236,27 +489,55 @@ Vectorized inline intdiv_nosve(const Vectorized& a, const Vectorized& b } template 
<> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svdiv_s64_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svdiv_s32_x(ptrue, a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return intdiv_nosve(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return intdiv_nosve(a, b); } template <> +<<<<<<< HEAD inline void convert(const int32_t *src, int64_t *dst, int64_t n) { +======= +inline void convert(const int32_t* src, int64_t* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); @@ -272,14 +553,23 @@ inline void convert(const int32_t *src, int64_t *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const int64_t *src, float *dst, int64_t n) { +======= +inline void convert(const int64_t* src, float* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i); +<<<<<<< HEAD svfloat32_t src_vec_f32 = svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); +======= + svfloat32_t src_vec_f32 = + svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f32(pg_32, dst + i, src_vec_f32); } #pragma unroll @@ -287,13 +577,22 @@ inline void convert(const int64_t *src, float *dst, int64_t n) { pg_32 = svwhilelt_b32(i, n); pg_64 = svwhilelt_b64(i, n); svint64_t src_vec_s64 = svldnt1_s64(pg_64, src + i); +<<<<<<< HEAD svfloat32_t src_vec_f32 = svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); +======= + svfloat32_t src_vec_f32 = + svuzp1_f32(svcvt_f32_s64_x(pg_64, src_vec_s64), ZERO_F32); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svst1_f32(pg_32, dst + i, src_vec_f32); } } template <> +<<<<<<< HEAD inline void convert(const int32_t *src, float *dst, int64_t n) { 
+======= +inline void convert(const int32_t* src, float* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll @@ -310,14 +609,25 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const bool *src, int64_t *dst, int64_t n) { +======= +inline void convert(const bool* src, int64_t* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized::size()); svbool_t pg_64 = svwhilelt_b64(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); svuint64_t src_vec_u64 = svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint64_t src_vec_u64 = + svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64); svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64)); } @@ -325,21 +635,37 @@ inline void convert(const bool *src, int64_t *dst, int64_t n) { for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg_8 = svwhilelt_b8(i, n); pg_64 = svwhilelt_b64(i, n); +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); svuint64_t src_vec_u64 = svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); + svuint64_t src_vec_u64 = + svunpklo_u64(svunpklo_u32(svunpklo_u16(src_vec_u8))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svbool_t mask = svcmpne_u64(pg_64, src_vec_u64, ZERO_U64); svst1_s64(pg_64, dst + i, svsel_s64(mask, ONE_S64, ZERO_S64)); } } template <> +<<<<<<< HEAD inline void convert(const bool *src, int32_t *dst, int64_t n) { +======= +inline void convert(const bool* src, int32_t* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized::size()); svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { +<<<<<<< HEAD svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32)); @@ -348,7 +674,12 @@ inline void convert(const bool *src, int32_t *dst, int64_t n) { for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg_8 = svwhilelt_b8(i, n); pg_32 = svwhilelt_b32(i, n); +<<<<<<< HEAD svuint8_t 
src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast(src) + i); +======= + svuint8_t src_vec_u8 = + svldnt1_u8(pg_8, reinterpret_cast(src) + i); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8)); svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32); svst1_s32(pg_32, dst + i, svsel_s32(mask, ONE_S32, ZERO_S32)); @@ -356,64 +687,135 @@ inline void convert(const bool *src, int32_t *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const uint8_t *src, bool *dst, int64_t n) { +======= +inline void convert(const uint8_t* src, bool* dst, int64_t n) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t fraction = n % Vectorized::size(); svbool_t pg = svwhilelt_b8(0ull, Vectorized::size()); #pragma unroll for (int64_t i = 0; i < n - fraction; i += Vectorized::size()) { svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8); +<<<<<<< HEAD svst1_u8(pg, reinterpret_cast(dst) + i, svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); +======= + svst1_u8( + pg, + reinterpret_cast(dst) + i, + svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #pragma unroll for (int64_t i = n - fraction; i < n; i += Vectorized::size()) { pg = svwhilelt_b8(i, n); svbool_t mask = svcmpne_u8(pg, svldnt1_u8(pg, src + i), ZERO_U8); +<<<<<<< HEAD svst1_u8(pg, reinterpret_cast(dst) + i, svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); +======= + svst1_u8( + pg, + reinterpret_cast(dst) + i, + svsel_u8(mask, ALL_U8_TRUE_MASK, ALL_U8_FALSE_MASK)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svlsl_s64_x(ptrue, a, svreinterpret_u64_s64(b)); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svlsl_s32_x(ptrue, a, svreinterpret_u32_s32(b)); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svlsl_s16_x(ptrue, a, svreinterpret_u16_s16(b)); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svlsl_s8_x(ptrue, a, svreinterpret_u8_s8(b)); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= 
+Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svasr_s64_x(ptrue, a, svreinterpret_u64_s64(b)); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svasr_s32_x(ptrue, a, svreinterpret_u32_s32(b)); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svasr_s16_x(ptrue, a, svreinterpret_u16_s16(b)); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b)); } #endif // defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/sve/vec_qint.h b/aten/src/ATen/cpu/vec/sve/vec_qint.h index 96e201ef36a2..5f9172f60256 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_qint.h +++ b/aten/src/ATen/cpu/vec/sve/vec_qint.h @@ -32,9 +32,14 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
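
Annotation (not part of the patch): the operator<< and operator>> specializations above share one idea: left shifts use the logical shift intrinsic (svlsl), right shifts use the arithmetic shift (svasr), and the per-lane shift counts, which arrive as a signed vector, are reinterpreted as unsigned lanes. A sketch for the int32 case, assuming SVE:

#include <arm_sve.h>

inline svint32_t shl_s32_sketch(svint32_t a, svint32_t counts) {
  // Logical left shift; counts reinterpreted as unsigned per-lane amounts.
  return svlsl_s32_x(svptrue_b32(), a, svreinterpret_u32_s32(counts));
}

inline svint32_t sar_s32_sketch(svint32_t a, svint32_t counts) {
  // Arithmetic right shift preserves the sign bit of each lane.
  return svasr_s32_x(svptrue_b32(), a, svreinterpret_u32_s32(counts));
}
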
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // Note [CPU_CAPABILITY namespace] @@ -108,8 +113,15 @@ struct VectorizedQuantizedConverter { for (int i = 0; i < float_num_vecs(); ++i) { float tmp_vals[Vectorized::size()]; for (int j = 0; j < Vectorized::size(); ++j) { +<<<<<<< HEAD tmp_vals[j] = at::native::dequantize_val(tmp_scale[j], tmp_zero_point[j], T(vals[Vectorized::size() * i + j])); +======= + tmp_vals[j] = at::native::dequantize_val( + tmp_scale[j], + tmp_zero_point[j], + T(vals[Vectorized::size() * i + j])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } rv[i] = Vectorized::loadu(tmp_vals); } @@ -127,8 +139,15 @@ struct VectorizedQuantizedConverter { for (int i = 0; i < float_num_vecs(); ++i) { float tmp_vals[Vectorized::size()]; for (int j = 0; j < Vectorized::size(); ++j) { +<<<<<<< HEAD tmp_vals[j] = at::native::dequantize_val(tmp_scale[j], tmp_zero_point[j], T(vals[Vectorized::size() * i + j])); +======= + tmp_vals[j] = at::native::dequantize_val( + tmp_scale[j], + tmp_zero_point[j], + T(vals[Vectorized::size() * i + j])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } rv[i] = Vectorized::loadu(tmp_vals); } @@ -140,11 +159,22 @@ struct VectorizedQuantizedConverter { }; template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint32, std::array, 1>, std::array, 1>, VECTOR_WIDTH / 4> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + VECTOR_WIDTH / 4> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::qint32, @@ -169,6 +199,7 @@ struct Vectorized : public VectorizedQuantizedConverter< } static Vectorized loadu(const void* ptr, int64_t count) { +<<<<<<< HEAD __at_align__ value_type tmp_values[size()]; // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two @@ -181,6 +212,26 @@ struct Vectorized : public VectorizedQuantizedConverter< } #else static Vectorized loadu(const void* ptr, int64_t count = size()) { +======= + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
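
Annotation (not part of the patch): the dequantize loops above call at::native::dequantize_val per lane. That helper lives in ATen's quantization utilities; the sketch below only restates the standard affine mapping it is expected to compute, not its exact implementation.

#include <cstdint>

inline float dequantize_affine_sketch(float scale, int32_t zero_point, int32_t q) {
  // real_value = scale * (quantized_value - zero_point)
  return scale * static_cast<float>(q - zero_point);
}
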
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } +#else + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == size()) return svld1_s32(ptrue, reinterpret_cast(ptr)); svbool_t pg = svwhilelt_b32(0ull, count); @@ -196,7 +247,13 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array::size()> float_vals; for (int i = 0; i < float_num_vecs(); ++i) { +<<<<<<< HEAD rhs[i].store(&float_vals[i * Vectorized::size()], Vectorized::size()); +======= + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::native::quantize_vec( @@ -225,11 +282,18 @@ struct Vectorized : public VectorizedQuantizedConverter< return retval; } +<<<<<<< HEAD Vectorized relu(Vectorized zero_point) const { return maximum(zero_point); } +======= + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -264,7 +328,13 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -291,11 +361,22 @@ Vectorized inline operator+( } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint8, std::array, 4>, std::array, 4>, VECTOR_WIDTH> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::qint8, @@ -320,6 +401,7 @@ struct Vectorized : public VectorizedQuantizedConverter< } static Vectorized loadu(const void* ptr, int64_t count) { +<<<<<<< HEAD __at_align__ value_type tmp_values[size()]; // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two @@ -329,6 +411,21 @@ struct Vectorized : public VectorizedQuantizedConverter< } std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); return loadu(tmp_values); +======= + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
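
Annotation (not part of the patch): the loadu(ptr, count) overloads above all use the same partial-load idiom: give every lane of a stack buffer a defined value (the comment in the diff explains why an explicit loop is preferred over "={0}"), copy only the valid elements, then hand the buffer to the full-width loadu. A generic sketch, with an illustrative name:

#include <cstdint>
#include <cstring>

template <typename T, int N>
void safe_partial_load_sketch(const void* ptr, int64_t count, T (&lanes)[N]) {
  for (int i = 0; i < N; ++i) {
    lanes[i] = T(0); // defined values for the lanes beyond `count`
  }
  // Copy only the valid elements; the caller then performs a full-width
  // vector load from `lanes`.
  std::memcpy(lanes, ptr, static_cast<size_t>(count) * sizeof(T));
}
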
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static Vectorized quantize( @@ -340,7 +437,13 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array::size()> float_vals; for (int i = 0; i < float_num_vecs(); ++i) { +<<<<<<< HEAD rhs[i].store(&float_vals[i * Vectorized::size()], Vectorized::size()); +======= + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::native::quantize_vec( @@ -418,16 +521,33 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::quint8, std::array, 4>, std::array, 4>, VECTOR_WIDTH> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + VECTOR_WIDTH> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::quint8, @@ -452,6 +572,7 @@ struct Vectorized : public VectorizedQuantizedConverter< } static Vectorized loadu(const void* ptr, int64_t count) { +<<<<<<< HEAD __at_align__ value_type tmp_values[size()]; // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two @@ -464,6 +585,26 @@ struct Vectorized : public VectorizedQuantizedConverter< } #else static Vectorized loadu(const void* ptr, int64_t count = size()) { +======= + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
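
Annotation (not part of the patch): the quantize() methods above gather float lanes and hand them to at::native::quantize_vec, and relu on quantized data is a plain max against the zero point. The scalar sketch below shows the usual affine quantization (round, add zero point, clamp; qmin/qmax shown for int8) and the quantized relu; it restates the standard formulas, not ATen's exact implementation.

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t quantize_affine_q8_sketch(float x, float scale, int32_t zero_point) {
  const int32_t qmin = -128, qmax = 127;
  const int32_t q = static_cast<int32_t>(std::nearbyint(x / scale)) + zero_point;
  return static_cast<int8_t>(std::min(qmax, std::max(qmin, q)));
}

inline int8_t qrelu_q8_sketch(int8_t q, int8_t zero_point) {
  // relu in the quantized domain: clamp below at the zero point.
  return std::max(q, zero_point);
}
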
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } +#else + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == size()) return svld1_u8(ptrue, reinterpret_cast(ptr)); svbool_t pg = svwhilelt_b8(0ull, count); @@ -479,7 +620,13 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array::size()> float_vals; for (int i = 0; i < float_num_vecs(); ++i) { +<<<<<<< HEAD rhs[i].store(&float_vals[i * Vectorized::size()], Vectorized::size()); +======= + rhs[i].store( + &float_vals[i * Vectorized::size()], + Vectorized::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } at::native::quantize_vec( @@ -512,7 +659,10 @@ struct Vectorized : public VectorizedQuantizedConverter< return maximum(zero_point); } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -558,10 +708,21 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } #endif // defined(CPU_CAPABILITY_SVE) +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec.h b/aten/src/ATen/cpu/vec/vec.h index e4b0c4b95d84..6e2ea59d6a1e 100644 --- a/aten/src/ATen/cpu/vec/vec.h +++ b/aten/src/ATen/cpu/vec/vec.h @@ -28,12 +28,19 @@ inline Vectorized Vectorized::loadu(const void* ptr) { } template <> +<<<<<<< HEAD inline Vectorized Vectorized::loadu(const void* ptr, int64_t count) { +======= +inline Vectorized Vectorized::loadu( + const void* ptr, + int64_t count) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See NOTE [Loading boolean values] return convert_to_bool(Vectorized::loadu(ptr, count)); } template +<<<<<<< HEAD struct VecHoldType { using hold_type = typename VT::value_type; }; template <> @@ -41,8 +48,28 @@ struct VecHoldType> { using hold_type = BFloat16; }; template <> struct VecHoldType> {using hold_type = Half; }; +======= +struct VecHoldType { + using hold_type = typename VT::value_type; +}; + +template <> +struct VecHoldType> { + using hold_type = BFloat16; +}; + +template <> +struct VecHoldType> { + using hold_type = Half; +}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template using vechold_type = typename VecHoldType::hold_type; +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git 
a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h index 7d594c696f7a..b476c31e9a95 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h @@ -11,8 +11,12 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] +<<<<<<< HEAD inline namespace CPU_CAPABILITY { +======= +inline namespace CPU_CAPABILITY { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Following vec128_half_neon.h, we only support aarch64. #if !defined(C10_MOBILE) && defined(__aarch64__) @@ -26,6 +30,7 @@ namespace CPU_CAPABILITY { // definitions in case they are actually there!). (See // https://godbolt.org/z/orv6e94n4 ) So, we need to handle it as // uint16_t in that case. +<<<<<<< HEAD #define IMPLEMENT_AT_BF16_SHIM(vec_suffix) \ inline at_bfloat16x4_t at_vget_low_bf16( \ at_bfloat16x8_t a) { \ @@ -90,6 +95,65 @@ namespace CPU_CAPABILITY { } else { \ return vreinterpret_u16_bf16(val); \ } \ +======= +#define IMPLEMENT_AT_BF16_SHIM(vec_suffix) \ + inline at_bfloat16x4_t at_vget_low_bf16(at_bfloat16x8_t a) { \ + return vget_low_##vec_suffix(a); \ + } \ + \ + inline at_bfloat16x4_t at_vget_high_bf16(at_bfloat16x8_t a) { \ + return vget_high_##vec_suffix(a); \ + } \ + \ + inline at_bfloat16x8_t at_vcombine_bf16( \ + at_bfloat16x4_t low, at_bfloat16x4_t high) { \ + return vcombine_##vec_suffix(low, high); \ + } \ + \ + inline at_bfloat16x8_t at_vdupq_n_bf16(at_bfloat16_t value) { \ + return vdupq_n_##vec_suffix(value); \ + } \ + \ + inline at_bfloat16x8_t at_vld1q_bf16(const at_bfloat16_t* ptr) { \ + return vld1q_##vec_suffix(ptr); \ + } \ + \ + inline void at_vst1q_bf16(at_bfloat16_t* ptr, at_bfloat16x8_t value) { \ + vst1q_##vec_suffix(ptr, value); \ + } \ + \ + template \ + inline at_bfloat16x8_t at_vreinterpretq_bf16_u16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpretq_bf16_u16(val); \ + } \ + } \ + template \ + inline at_bfloat16x4_t at_vreinterpret_bf16_u16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpret_bf16_u16(val); \ + } \ + } \ + template \ + inline uint16x8_t at_vreinterpretq_u16_bf16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpretq_u16_bf16(val); \ + } \ + } \ + template \ + inline uint16x4_t at_vreinterpret_u16_bf16(T val) { \ + if constexpr (std::is_same_v) { \ + return val; \ + } else { \ + return vreinterpret_u16_bf16(val); \ + } \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #ifdef __ARM_FEATURE_BF16 @@ -137,11 +201,34 @@ struct BlendBFloat16Regs { }; template <> +<<<<<<< HEAD class Vectorized : public Vectorized16> { using Base = Vectorized16>; friend Base; friend std::tuple, Vectorized> convert_bfloat16_float(const Vectorized& a); friend Vectorized convert_float_bfloat16(const Vectorized& a, const Vectorized& b); +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16< + at_bfloat16x8_t, + c10::BFloat16, + BlendBFloat16Regs, + Vectorized> { + using Base = Vectorized16< + at_bfloat16x8_t, + c10::BFloat16, + BlendBFloat16Regs, + Vectorized>; + friend Base; + friend std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a); + friend Vectorized convert_float_bfloat16( + const Vectorized& a, + const 
Vectorized& b); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: Vectorized map2( const Vectorized& second, @@ -171,10 +258,21 @@ class Vectorized : public Vectorized16); uint32x4_t as_uint32 = vreinterpretq_u32_f32(f32); +<<<<<<< HEAD uint32x4_t rounding_bias = vaddq_u32(vandq_u32(vshrq_n_u32(as_uint32, 16), vdupq_n_u32(1)), vdupq_n_u32(0x7FFF)); at_bfloat16x4_t rounded = vshrn_n_u32(vaddq_u32(as_uint32, rounding_bias), 16); const auto bf16_nan = vdup_n_u16(0x7FC0); return vbsl_u16(vmovn_u32(vreinterpretq_u32_f32(f32.isnan())), bf16_nan, rounded); +======= + uint32x4_t rounding_bias = vaddq_u32( + vandq_u32(vshrq_n_u32(as_uint32, 16), vdupq_n_u32(1)), + vdupq_n_u32(0x7FFF)); + at_bfloat16x4_t rounded = + vshrn_n_u32(vaddq_u32(as_uint32, rounding_bias), 16); + const auto bf16_nan = vdup_n_u16(0x7FC0); + return vbsl_u16( + vmovn_u32(vreinterpretq_u32_f32(f32.isnan())), bf16_nan, rounded); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // __ARM_FEATURE_BF16 } @@ -215,9 +313,18 @@ class Vectorized : public Vectorized16 mv0 = (Vectorized(v00).*m)(second_v00); Vectorized mv1 = (Vectorized(v01).*m)(second_v01); // Assume the operator returns a bitmask, not "real" floats, and +<<<<<<< HEAD // just narrow the bits. All-ones is a NaN and will get mangled by conversion! at_bfloat16x4_t r00 = at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); at_bfloat16x4_t r01 = at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); +======= + // just narrow the bits. All-ones is a NaN and will get mangled by + // conversion! + at_bfloat16x4_t r00 = + at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); + at_bfloat16x4_t r01 = + at_vreinterpret_bf16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(at_vcombine_bf16(r00, r01)); } @@ -226,7 +333,12 @@ class Vectorized : public Vectorized16(val.x))) {} +======= + Vectorized(c10::BFloat16 val) + : Vectorized16(at_vdupq_n_bf16(c10::bit_cast(val.x))) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(float val) : Vectorized(c10::BFloat16(val)) {} Vectorized( value_type val0, @@ -238,6 +350,7 @@ class Vectorized : public Vectorized16(val0.x), c10::bit_cast(val1.x), c10::bit_cast(val2.x), @@ -247,11 +360,22 @@ class Vectorized : public Vectorized16(val6.x), c10::bit_cast(val7.x)}) {} +======= + c10::bit_cast(val0.x), + c10::bit_cast(val1.x), + c10::bit_cast(val2.x), + c10::bit_cast(val3.x), + c10::bit_cast(val4.x), + c10::bit_cast(val5.x), + c10::bit_cast(val6.x), + c10::bit_cast(val7.x)}) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized blendv( const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +<<<<<<< HEAD // NOTE: blendv has the same problems as it does for Half; see comments in vec128_half_neon.h. 
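
Annotation (not part of the patch): the non-__ARM_FEATURE_BF16 conversion path above rounds float to bfloat16 with the classic round-to-nearest-even bias trick (add 0x7FFF plus the lowest bit that survives truncation, then drop 16 bits) and replaces NaNs with the quiet pattern 0x7FC0. A scalar sketch of the same arithmetic:

#include <cmath>
#include <cstdint>
#include <cstring>

inline uint16_t float_to_bf16_rne_sketch(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if (std::isnan(f)) {
    return 0x7FC0; // canonical quiet NaN, matching the vector path above
  }
  const uint32_t rounding_bias = ((bits >> 16) & 1u) + 0x7FFFu;
  return static_cast<uint16_t>((bits + rounding_bias) >> 16);
}
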
Vectorized vec(mask.values); vec.values = at_vreinterpretq_bf16_u16( @@ -259,6 +383,15 @@ class Vectorized : public Vectorized16 vec(mask.values); + vec.values = at_vreinterpretq_bf16_u16(vbslq_u16( + at_vreinterpretq_u16_bf16(vec.values), + at_vreinterpretq_u16_bf16(b.values), + at_vreinterpretq_u16_bf16(a.values))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec; } static Vectorized set( @@ -271,6 +404,7 @@ class Vectorized : public Vectorized16 vec( at_vreinterpretq_bf16_u16( vbslq_u16( @@ -281,6 +415,18 @@ class Vectorized : public Vectorized16 loadu(const void* ptr, int64_t count = size()) { +======= + Vectorized vec(at_vreinterpretq_bf16_u16(vbslq_u16( + mask, + at_vreinterpretq_u16_bf16(b.values), + at_vreinterpretq_u16_bf16(a.values)))); + + return vec; + } + static Vectorized loadu( + const void* ptr, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == size()) { return at_vld1q_bf16(reinterpret_cast(ptr)); } @@ -332,9 +478,16 @@ class Vectorized : public Vectorized16::name); \ } +<<<<<<< HEAD #define DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(name) \ Vectorized name(const Vectorized& other) const { \ return map2_bitmask_with_vec_float_method(other, &Vectorized::name); \ +======= +#define DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(name) \ + Vectorized name(const Vectorized& other) const { \ + return map2_bitmask_with_vec_float_method( \ + other, &Vectorized::name); \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs) @@ -361,6 +514,7 @@ class Vectorized : public Vectorized16 +<<<<<<< HEAD inline std::tuple, Vectorized> convert_bfloat16_float(const Vectorized& a) { static_assert(Vectorized::size() == 2 * Vectorized::size()); at_bfloat16x8_t x = a; @@ -370,6 +524,24 @@ inline std::tuple, Vectorized> convert_bfloat16_float(c } inline Vectorized convert_float_bfloat16(const Vectorized& a, const Vectorized& b) { static_assert(Vectorized::size() == 2 * Vectorized::size()); +======= +inline std::tuple, Vectorized> convert_bfloat16_float( + const Vectorized& a) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); + at_bfloat16x8_t x = a; + float32x4_t x1 = + Vectorized::convert_f32_bf16(at_vget_low_bf16(x)); + float32x4_t x2 = + Vectorized::convert_f32_bf16(at_vget_high_bf16(x)); + return {Vectorized(x1), Vectorized(x2)}; +} +inline Vectorized convert_float_bfloat16( + const Vectorized& a, + const Vectorized& b) { + static_assert( + Vectorized::size() == 2 * Vectorized::size()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at_bfloat16x4_t x1 = Vectorized::convert_bf16_f32(a); at_bfloat16x4_t x2 = Vectorized::convert_bf16_f32(b); return Vectorized(at_vcombine_bf16(x1, x2)); @@ -383,8 +555,12 @@ Vectorized binary_operator_via_float( const auto [a_float_low, a_float_high] = convert_bfloat16_float(a); const auto [b_float_low, b_float_high] = convert_bfloat16_float(b); return convert_float_bfloat16( +<<<<<<< HEAD op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +======= + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) } template <> @@ -425,7 +601,12 @@ Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { return binary_operator_via_float( +<<<<<<< HEAD static_cast(*)(const Vectorized&, const Vectorized&)>(&maximum), +======= + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a, b); } @@ -435,7 +616,12 @@ Vectorized inline minimum( const Vectorized& a, const Vectorized& b) { return binary_operator_via_float( +<<<<<<< HEAD static_cast(*)(const Vectorized&, const Vectorized&)>(&minimum), +======= + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a, b); } @@ -466,24 +652,39 @@ template <> Vectorized inline operator&( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(at_vreinterpretq_bf16_u16(vandq_u16( at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +======= + return Vectorized(at_vreinterpretq_bf16_u16( + vandq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator|( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(at_vreinterpretq_bf16_u16(vorrq_u16( at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +======= + return Vectorized(at_vreinterpretq_bf16_u16( + vorrq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator^( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(at_vreinterpretq_bf16_u16(veorq_u16( at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +======= + return Vectorized(at_vreinterpretq_bf16_u16( + veorq_u16(at_vreinterpretq_u16_bf16(a), at_vreinterpretq_u16_bf16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline Vectorized Vectorized::eq( diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h index 4131802c9923..6bda73664ab8 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h @@ -11,8 +11,12 @@ struct VecConvert< 1, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_int8_half_register_to_float(src[0]); } @@ -23,8 +27,12 @@ struct VecConvert< 2, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { const auto [v0, v1] = convert_int8_to_float(src[0]); return VectorizedN(v0, v1); @@ -39,8 +47,15 @@ struct VecConvert { uint16x8_t u16_8 = vld1q_u16(reinterpret_cast(&src[0])); auto u16_low1 = 
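
Annotation (not part of the patch): binary_operator_via_float above implements the widen/compute/narrow pattern: promote both bfloat16 halves to float, apply the float operator, convert back. A scalar sketch of the same idea; the helpers are illustrative, and plain truncation is used for the narrowing step for brevity (the real kernel rounds to nearest even, as sketched earlier).

#include <cstdint>
#include <cstring>

inline float bf16_to_float_sketch(uint16_t h) {
  uint32_t bits = static_cast<uint32_t>(h) << 16; // bf16 is the top half of a float
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

template <typename Op>
uint16_t bf16_binary_via_float_sketch(uint16_t a, uint16_t b, Op op) {
  const float r = op(bf16_to_float_sketch(a), bf16_to_float_sketch(b));
  uint32_t bits;
  std::memcpy(&bits, &r, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16); // truncating narrow, for brevity only
}
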
vget_low_u16(u16_8); auto u16_high1 = vget_high_u16(u16_8); +<<<<<<< HEAD float32x4_t f32x4_0 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_low1), 16)); float32x4_t f32x4_1 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_high1), 16)); +======= + float32x4_t f32x4_0 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_low1), 16)); + float32x4_t f32x4_1 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_high1), 16)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) result[0] = f32x4_0; result[1] = f32x4_1; return result; @@ -53,7 +68,12 @@ struct VecConvert { const VectorizedN& src) { VectorizedN result; uint16x4_t u16_8 = vld1_u16(reinterpret_cast(&src[0])); +<<<<<<< HEAD float32x4_t f32x4_0 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_8), 16)); +======= + float32x4_t f32x4_0 = + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_8), 16)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) result[0] = f32x4_0; return result; } diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h index a51a8777fb6d..57a2df9f9ec2 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -41,6 +41,7 @@ inline namespace CPU_CAPABILITY { #define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code #endif +<<<<<<< HEAD template struct BlendRegs { static float32x4_t impl( @@ -51,22 +52,59 @@ template struct BlendRegs{ static float32x4_t impl( const float32x4_t& a, const float32x4_t& b, float32x4_t& res) { +======= +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res); +}; + +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index); } }; +<<<<<<< HEAD template struct BlendRegs{ static float32x4_t impl( const float32x4_t& a, const float32x4_t& b, float32x4_t& res) { +======= +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, + const float32x4_t& b, + float32x4_t& res) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index); } }; +<<<<<<< HEAD template <> class Vectorized { private: float32x4_t values; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + float32x4_t values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = float; using size_type = int; static constexpr size_type size() { @@ -75,13 +113,19 @@ template <> class Vectorized { Vectorized() {} Vectorized(float32x4_t v) : values(v) {} Vectorized(float val) : values{vdupq_n_f32(val)} {} +<<<<<<< HEAD Vectorized(float val0, float val1, float val2, float val3) : values{val0, val1, val2, val3} {} +======= + Vectorized(float val0, float val1, float val2, float val3) + : values{val0, val1, val2, val3} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) Vectorized(float (&arr)[4]) : Vectorized(arr[0], arr[1], arr[2], arr[3]) {} operator float32x4_t() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { Vectorized vec; vec.values = @@ -100,12 +144,33 @@ template <> class Vectorized { } static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + Vectorized vec; + vec.values = BlendRegs < 0, + (mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 1, + (mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 2, + (mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 3, + (mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values); + return vec; + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO // NB: This requires that each value, i.e., each uint value, // of the mask either all be zeros or all be 1s. // We perhaps need some kind of an assert? // But that will affect performance. Vectorized vec(mask.values); +<<<<<<< HEAD vec.values = vbslq_f32( vreinterpretq_u32_f32(vec.values), b.values, @@ -114,11 +179,22 @@ template <> class Vectorized { } template static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { +======= + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Vectorized base_vec(base); const Vectorized step_vec(step); const Vectorized step_sizes(0, 1, 2, 3); return fmadd(step_sizes, step_vec, base_vec); } +<<<<<<< HEAD static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { switch (count) { @@ -157,6 +233,39 @@ template <> class Vectorized { a.values); return vec; } +======= + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0x0, 0x0, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + case 2: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } + case 3: { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = + vbslq_f32(vreinterpretq_u32_f32(vec.values), b.values, a.values); + return vec; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return b; } @@ -204,7 +313,11 @@ template <> class Vectorized { __at_align__ float tmp[size()]; store(tmp); int mask = 0; +<<<<<<< HEAD for (int i = 0; i < size(); ++ i) { +======= + for (int i = 0; i < size(); ++i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary 
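
Annotation (not part of the patch): blendv for Vectorized<float> above relies on the NEON bit-select instruction, with the stated precondition that every mask lane is either all ones or all zeros. A minimal sketch, assuming AArch64 NEON:

#include <arm_neon.h>

inline float32x4_t blendv_f32_sketch(float32x4_t a, float32x4_t b, float32x4_t mask) {
  // Where a mask bit is set, take the bit from b; otherwise from a.
  return vbslq_f32(vreinterpretq_u32_f32(mask), b, a);
}
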
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (tmp[i] == 0.f) { mask |= (1 << i); } @@ -218,7 +331,11 @@ template <> class Vectorized { __at_align__ float tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { +<<<<<<< HEAD if(_isnan(tmp[i]) || _isinf(tmp[i])) { +======= + if (_isnan(tmp[i]) || _isinf(tmp[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } @@ -262,6 +379,7 @@ template <> class Vectorized { Vectorized conj() const { return *this; } +<<<<<<< HEAD #define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, sleef_name) \ Vectorized name() const { \ return USE_SLEEF( \ @@ -272,6 +390,17 @@ template <> class Vectorized { #define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(name) \ DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, Sleef_##name##f4_u10) +======= +#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, sleef_name) \ + Vectorized name() const { \ + return USE_SLEEF(Vectorized(sleef_name(values)), map(std::name)); \ + } + +#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(name) \ + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, Sleef_##name##f4_u10) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acos) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acosh) @@ -280,6 +409,7 @@ template <> class Vectorized { DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atan) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atanh) +<<<<<<< HEAD #define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, sleef_name) \ Vectorized name(const Vectorized &arg) const { \ return USE_SLEEF( \ @@ -295,6 +425,28 @@ template <> class Vectorized { DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(copysign, Sleef_copysignf4) Vectorized erf() const; DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(erfc, Sleef_erfcf4_u15) +======= +#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, sleef_name) \ + Vectorized name(const Vectorized& arg) const { \ + return USE_SLEEF( \ + Vectorized(sleef_name(values, arg.values)), \ + map2(arg, std::name)); \ + } + +#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(name) \ + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( \ + name, Sleef_##name##f4_u10) + + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(atan2) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + copysign, + Sleef_copysignf4) + Vectorized erf() const; + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + erfc, + Sleef_erfcf4_u15) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized erfinv() const { return map(calc_erfinv); } @@ -304,8 +456,17 @@ template <> class Vectorized { Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(fmod, Sleef_fmodf4) DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(hypot, Sleef_hypotf4_u05) +======= + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + fmod, + Sleef_fmodf4) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + hypot, + Sleef_hypotf4_u05) +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized i0() const { return map(calc_i0); } @@ -315,17 +476,30 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { return map2(x, calc_igamma); } Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { + return map2(x, calc_igamma); + } + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map2(x, calc_igammac); } DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log10) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log1p) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log2) +<<<<<<< HEAD DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(nextafter, Sleef_nextafterf4) +======= + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME( + nextafter, + Sleef_nextafterf4) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized frac() const; DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sin) DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sinh) @@ -338,11 +512,19 @@ template <> class Vectorized { return map(at::native::floor_impl); } Vectorized neg() const { +<<<<<<< HEAD return Vectorized( vnegq_f32(values)); } Vectorized round() const { // We do not use std::round because we would like to round midway numbers to the nearest even integer. +======= + return Vectorized(vnegq_f32(values)); + } + Vectorized round() const { + // We do not use std::round because we would like to round midway numbers to + // the nearest even integer. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(at::native::round_impl); } DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(tan) @@ -362,16 +544,27 @@ template <> class Vectorized { } DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(pow) Vectorized operator==(const Vectorized& other) const { +<<<<<<< HEAD return Vectorized(vreinterpretq_f32_u32(vceqq_f32(values, other.values))); } Vectorized operator!=(const Vectorized& other) const { float32x4_t r0 = vreinterpretq_f32_u32( vmvnq_u32(vceqq_f32(values, other.values))); +======= + return Vectorized( + vreinterpretq_f32_u32(vceqq_f32(values, other.values))); + } + + Vectorized operator!=(const Vectorized& other) const { + float32x4_t r0 = + vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(values, other.values))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(r0); } Vectorized operator<(const Vectorized& other) const { +<<<<<<< HEAD return Vectorized(vreinterpretq_f32_u32(vcltq_f32(values, other.values))); } @@ -385,6 +578,25 @@ template <> class Vectorized { Vectorized operator>=(const Vectorized& other) const { return Vectorized(vreinterpretq_f32_u32(vcgeq_f32(values, other.values))); +======= + return Vectorized( + vreinterpretq_f32_u32(vcltq_f32(values, other.values))); + } + + Vectorized operator<=(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcleq_f32(values, other.values))); + } + + Vectorized operator>(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcgtq_f32(values, other.values))); + } + + Vectorized operator>=(const Vectorized& other) const { + return Vectorized( + vreinterpretq_f32_u32(vcgeq_f32(values, other.values))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized eq(const Vectorized& other) const; @@ -396,22 +608,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vaddq_f32(a, b)); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vsubq_f32(a, b)); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vmulq_f32(a, b)); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vdivq_f32(a, b)); } @@ -420,6 +656,7 @@ inline Vectorized Vectorized::frac() const { return 
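
Annotation (not part of the patch): the round() comment above explains why std::round is avoided: at::native::round_impl is expected to round halfway cases to the nearest even integer, while std::round rounds them away from zero. A small demonstration of the difference using std::nearbyint under the default FE_TONEAREST mode, which also rounds ties to even:

#include <cmath>
#include <cstdio>

int main() {
  std::printf("%g %g\n", std::nearbyint(0.5), std::round(0.5));   // 0 1
  std::printf("%g %g\n", std::nearbyint(2.5), std::round(2.5));   // 2 3
  std::printf("%g %g\n", std::nearbyint(-1.5), std::round(-1.5)); // -2 -2
  return 0;
}
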
*this - this->trunc(); } +<<<<<<< HEAD //Added sleef Implementation for Maximum Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { if(!a.has_inf_nan() && !b.has_inf_nan()){ @@ -430,31 +667,64 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized(vmaxq_f32(a, b)); } +======= +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vmaxq_f32(a, b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vminq_f32(a, b)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return minimum(max, maximum(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return minimum(max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return maximum(min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { return Vectorized(vreinterpretq_f32_u32(vandq_u32( vreinterpretq_u32_f32(a), @@ -496,6 +766,58 @@ inline Vectorized Vectorized::lt(const Vectorized& other) c } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32( + veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)))); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + 
const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -505,7 +827,12 @@ inline void convert(const float* src, int32_t* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i))); } #ifndef __msvc_cl__ @@ -522,7 +849,12 @@ inline void convert(const int32_t* src, float* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i))); } #ifndef __msvc_cl__ @@ -534,11 +866,19 @@ inline void convert(const int32_t* src, float* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(vfmaq_f32(c, a, b)); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { return Vectorized(vnegq_f32(vfmsq_f32(c, a, b))); } @@ -573,9 +913,54 @@ inline Vectorized Vectorized::erf() const{ auto tmp6 = t * tmp5; auto tmp7 = fmadd(tmp6, r, one_vec); return tmp7 ^ sign_mask; +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return Vectorized(vnegq_f32(vfmsq_f32(c, a, b))); +} + +inline Vectorized Vectorized::erf() const { + // constants + const Vectorized neg_zero_vec(-0.f); + const Vectorized one_vec(1.0f); + const Vectorized p(0.3275911f); + const Vectorized p1(0.254829592f); + const Vectorized p2(-0.284496736f); + const Vectorized p3(1.421413741f); + const Vectorized p4(-1.453152027f); + const Vectorized p5(1.061405429f); + // sign(x) + auto sign_mask = neg_zero_vec & *this; + auto abs_vec = this->abs(); + // t = 1 / (p * abs(x) + 1) + auto tmp0 = fmadd(p, abs_vec, one_vec); + auto t = one_vec / tmp0; + // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 + auto tmp1 = fmadd(p5, t, p4); + auto tmp2 = fmadd(tmp1, t, p3); + auto tmp3 = fmadd(tmp2, t, p2); + auto r = fmadd(tmp3, t, p1); + // - exp(- x * x) + auto pow_2 = (*this) * (*this); + auto neg_pow_2 = pow_2 ^ neg_zero_vec; + auto tmp4 = neg_pow_2.map( + std::exp); // This can be swapped for a faster implementation of exp. 
+ auto tmp5 = tmp4 ^ neg_zero_vec; + // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) + auto tmp6 = t * tmp5; + auto tmp7 = fmadd(tmp6, r, one_vec); + return tmp7 ^ sign_mask; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #undef DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC #undef DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC #endif /* defined(aarch64) */ +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h index e75e9d67655c..eef48a1a8fd6 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h @@ -58,12 +58,33 @@ struct BlendHalfRegs { } }; +<<<<<<< HEAD // On ARM, Half type supports float16_t->Half constructor and Half->float16_t // conversion template <> class Vectorized : public Vectorized16> { using Base = Vectorized16>; friend Base; +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +// On ARM, Half type supports float16_t->Half constructor and Half->float16_t +// conversion +template <> +class Vectorized : public Vectorized16< + float16x8_t, + c10::Half, + BlendHalfRegs, + Vectorized> { + using Base = Vectorized16< + float16x8_t, + c10::Half, + BlendHalfRegs, + Vectorized>; + friend Base; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: // We use these private map functions to implement various methods Vectorized map_with_vec_float_method( @@ -85,8 +106,15 @@ class Vectorized : public Vectorized16 mv0 = (Vectorized(v00).*m)(Vectorized(second_v00)); Vectorized mv1 = (Vectorized(v01).*m)(Vectorized(second_v01)); +======= + Vectorized mv0 = + (Vectorized(v00).*m)(Vectorized(second_v00)); + Vectorized mv1 = + (Vectorized(v01).*m)(Vectorized(second_v01)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) float16x4_t r00 = vcvt_f16_f32(mv0); float16x4_t r01 = vcvt_f16_f32(mv1); @@ -102,12 +130,26 @@ class Vectorized : public Vectorized16 mv0 = (Vectorized(v00).*m)(Vectorized(second_v00)); Vectorized mv1 = (Vectorized(v01).*m)(Vectorized(second_v01)); // Assume the operator returns a bitmask, not "real" floats, and // just narrow the bits. All-ones is a NaN and will get mangled by conversion! float16x4_t r00 = vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); float16x4_t r01 = vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); +======= + Vectorized mv0 = + (Vectorized(v00).*m)(Vectorized(second_v00)); + Vectorized mv1 = + (Vectorized(v01).*m)(Vectorized(second_v01)); + // Assume the operator returns a bitmask, not "real" floats, and + // just narrow the bits. All-ones is a NaN and will get mangled by + // conversion! 
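
Annotation (not part of the patch): Vectorized<float>::erf() above evaluates the Abramowitz and Stegun style rational polynomial approximation (the constants 0.3275911, 0.254829592, ... are the usual ones, commonly quoted with absolute error around 1.5e-7). A scalar sketch of the same formula, kept in the same evaluation order as the fmadd chain above:

#include <cmath>

inline float erf_approx_sketch(float x) {
  const float p  = 0.3275911f;
  const float p1 = 0.254829592f, p2 = -0.284496736f, p3 = 1.421413741f;
  const float p4 = -1.453152027f, p5 = 1.061405429f;
  const float sign = std::signbit(x) ? -1.0f : 1.0f;
  const float ax = std::fabs(x);
  const float t = 1.0f / (1.0f + p * ax);                        // t = 1 / (1 + p*|x|)
  const float r = p1 + t * (p2 + t * (p3 + t * (p4 + t * p5)));  // Horner form of the polynomial
  return sign * (1.0f - r * t * std::exp(-ax * ax));             // erf(x) ~ sign(x)*(1 - r*t*e^(-x^2))
}
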
+ float16x4_t r00 = + vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); + float16x4_t r01 = + vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Pack result into Vectorized return Vectorized(vcombine_f16(r00, r01)); @@ -120,10 +162,15 @@ class Vectorized : public Vectorized16>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized( value_type val0, value_type val1, @@ -133,6 +180,7 @@ class Vectorized : public Vectorized16 : public Vectorized16>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized blendv( const Vectorized& a, @@ -162,11 +214,18 @@ class Vectorized : public Vectorized16 vec(mask.values); +<<<<<<< HEAD vec.values = vreinterpretq_f16_u16( vbslq_u16( vreinterpretq_u16_f16(vec.values), vreinterpretq_u16_f16(b.values), vreinterpretq_u16_f16(a.values))); +======= + vec.values = vreinterpretq_f16_u16(vbslq_u16( + vreinterpretq_u16_f16(vec.values), + vreinterpretq_u16_f16(b.values), + vreinterpretq_u16_f16(a.values))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec; } static Vectorized set( @@ -181,12 +240,19 @@ class Vectorized : public Vectorized16 vec( vreinterpretq_f16_u16( vbslq_u16( mask, vreinterpretq_u16_f16(b.values), vreinterpretq_u16_f16(a.values)))); +======= + Vectorized vec(vreinterpretq_f16_u16(vbslq_u16( + mask, + vreinterpretq_u16_f16(b.values), + vreinterpretq_u16_f16(a.values)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec; } @@ -284,50 +350,98 @@ class Vectorized : public Vectorized16 operator==(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vceqq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator==); +======= + return Vectorized( + vreinterpretq_f16_u16(vceqq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator==); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator!=(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16( vmvnq_u16(vceqq_f16(values, other.values)))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator!=); +======= + return Vectorized( + vreinterpretq_f16_u16(vmvnq_u16(vceqq_f16(values, other.values)))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator!=); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator<(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vcltq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator<); +======= + return Vectorized( + vreinterpretq_f16_u16(vcltq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, 
&Vectorized::operator<); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator<=(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vcleq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator<=); +======= + return Vectorized( + vreinterpretq_f16_u16(vcleq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator<=); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator>(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vcgtq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator>); +======= + return Vectorized( + vreinterpretq_f16_u16(vcgtq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator>); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } Vectorized operator>=(const Vectorized& other) const { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vcgeq_f16(values, other.values))); #else return map2_bitmask_with_vec_float_method(other, &Vectorized::operator>=); +======= + return Vectorized( + vreinterpretq_f16_u16(vcgeq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator>=); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } @@ -339,14 +453,27 @@ class Vectorized : public Vectorized16 le(const Vectorized& other) const; }; // Vectorized +<<<<<<< HEAD inline std::tuple, Vectorized> convert_half_float(const Vectorized& a) { +======= +inline std::tuple, Vectorized> convert_half_float( + const Vectorized& a) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(Vectorized::size() == 2 * Vectorized::size()); float16x8_t x = a; float32x4_t x1 = vcvt_f32_f16(vget_low_f16(x)); float32x4_t x2 = vcvt_f32_f16(vget_high_f16(x)); +<<<<<<< HEAD return { Vectorized(x1), Vectorized(x2) }; } inline Vectorized convert_float_half(const Vectorized& a, const Vectorized& b) { +======= + return {Vectorized(x1), Vectorized(x2)}; +} +inline Vectorized convert_float_half( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(Vectorized::size() == 2 * Vectorized::size()); float32x4_t x = a; float32x4_t y = b; @@ -363,8 +490,12 @@ Vectorized binary_operator_via_float( const auto [a_float_low, a_float_high] = convert_half_float(a); const auto [b_float_low, b_float_high] = convert_half_float(b); return convert_float_half( +<<<<<<< HEAD op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +======= + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -426,7 +557,12 @@ Vectorized inline maximum( 
return Vectorized(vmaxq_f16(a, b)); #else return binary_operator_via_float( +<<<<<<< HEAD static_cast(*)(const Vectorized&, const Vectorized&)>(&maximum), +======= + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a, b); #endif @@ -442,7 +578,12 @@ Vectorized inline minimum( return Vectorized(vminq_f16(a, b)); #else return binary_operator_via_float( +<<<<<<< HEAD static_cast(*)(const Vectorized&, const Vectorized&)>(&minimum), +======= + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a, b); #endif @@ -474,24 +615,39 @@ template <> Vectorized inline operator&( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vandq_u16( vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +======= + return Vectorized(vreinterpretq_f16_u16( + vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator|( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(vorrq_u16( vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +======= + return Vectorized(vreinterpretq_f16_u16( + vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> Vectorized inline operator^( const Vectorized& a, const Vectorized& b) { +<<<<<<< HEAD return Vectorized(vreinterpretq_f16_u16(veorq_u16( vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +======= + return Vectorized(vreinterpretq_f16_u16( + veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline Vectorized Vectorized::eq( diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h index fec580eef4d6..6a2b88a47cab 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h @@ -10,10 +10,22 @@ inline namespace CPU_CAPABILITY { // Shared implementation between Vectorized and // Vectorized. Uses CRTP to allow derived class // customization. 
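The maximum/minimum fallbacks above, like binary_operator_via_float earlier in this file, all share one widen-compute-narrow shape when native fp16 arithmetic is unavailable. A minimal standalone sketch of that pattern, assuming AArch64 NEON and written independently of the PyTorch templates (the helper names are mine):

#include <arm_neon.h>

// Split the 8 half-precision lanes into two float32x4 halves, run the float
// op on each, and convert back. Op is any callable taking and returning
// float32x4_t.
template <typename Op>
float16x8_t binary_via_float_sketch(float16x8_t a, float16x8_t b, Op op) {
  float32x4_t a_lo = vcvt_f32_f16(vget_low_f16(a));
  float32x4_t a_hi = vcvt_f32_f16(vget_high_f16(a));
  float32x4_t b_lo = vcvt_f32_f16(vget_low_f16(b));
  float32x4_t b_hi = vcvt_f32_f16(vget_high_f16(b));
  return vcombine_f16(vcvt_f16_f32(op(a_lo, b_lo)),
                      vcvt_f16_f32(op(a_hi, b_hi)));
}

// Example: elementwise maximum computed through float32.
inline float16x8_t max_via_float(float16x8_t a, float16x8_t b) {
  return binary_via_float_sketch(
      a, b, [](float32x4_t x, float32x4_t y) { return vmaxq_f32(x, y); });
}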
+<<<<<<< HEAD template typename BlendRegs, typename Derived> struct Vectorized16 { protected: VecT values; +======= +template < + typename VecT, + typename ValueT, + template typename BlendRegs, + typename Derived> +struct Vectorized16 { + protected: + VecT values; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: using value_type = ValueT; using size_type = int; @@ -28,7 +40,12 @@ struct Vectorized16 { value_type (*const f)(value_type, value_type)) const { __at_align__ value_type tmp_first[size()]; __at_align__ value_type tmp_second[size()]; +<<<<<<< HEAD static_cast(this)->store(tmp_first); // store this to tmp_first +======= + static_cast(this)->store( + tmp_first); // store this to tmp_first +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) second.store(tmp_second); for (const auto i : c10::irange(size())) { tmp_first[i] = f(tmp_first[i], tmp_second[i]); @@ -47,6 +64,7 @@ struct Vectorized16 { template static Derived blend(const Derived& a, const Derived& b) { Derived vec; +<<<<<<< HEAD vec.values = BlendRegs<0, (mask & 0x01) != 0>::impl( a.values, b.values, vec.values); vec.values = BlendRegs<1, (mask & 0x02) != 0>::impl( @@ -64,6 +82,25 @@ struct Vectorized16 { a.values, b.values, vec.values); vec.values = BlendRegs<7, (mask & 0x80) != 0>::impl( a.values, b.values, vec.values); +======= + vec.values = BlendRegs < 0, + (mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 1, + (mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 2, + (mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 3, + (mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values); + + vec.values = BlendRegs < 4, + (mask & 0x10) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 5, + (mask & 0x20) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 6, + (mask & 0x40) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 7, + (mask & 0x80) != 0 > ::impl(a.values, b.values, vec.values); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec; } @@ -120,8 +157,17 @@ struct Vectorized16 { Derived angle() const { auto zero = Derived(0); auto pi = Derived(c10::pi); +<<<<<<< HEAD auto tmp = Derived::blendv(zero, pi, *static_cast(this) < zero); return Derived::blendv(tmp, *static_cast(this), static_cast(this)->isnan()); +======= + auto tmp = + Derived::blendv(zero, pi, *static_cast(this) < zero); + return Derived::blendv( + tmp, + *static_cast(this), + static_cast(this)->isnan()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived real() const { return *this; @@ -137,6 +183,7 @@ struct Vectorized16 { // converting to FP32, applying the math function, and then converting back to // FP16/BF16. 
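Before the per-function wrappers that follow, a note on the blend<mask> member defined earlier in this class: bit i of the compile-time mask picks lane i from b when set and from a when clear, and the BlendRegs<i, bit> specializations simply unroll that choice so it is resolved at compile time. A scalar reference of the selection rule (illustrative, not the actual implementation):

#include <array>
#include <cstddef>
#include <cstdint>

// Scalar reference for blend<mask>. The real code expands this loop into one
// BlendRegs<i, (mask & (1 << i)) != 0>::impl call per lane, so each per-lane
// choice is a compile-time constant rather than a runtime branch.
template <int64_t Mask, typename T, std::size_t N>
std::array<T, N> blend_ref(const std::array<T, N>& a,
                           const std::array<T, N>& b) {
  std::array<T, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = ((Mask >> i) & 1) ? b[i] : a[i];
  }
  return out;
}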
Derived acos() const { +<<<<<<< HEAD return static_cast(this)->map_with_vec_float_method(&Vectorized::acos); } Derived acosh() const { @@ -180,12 +227,73 @@ struct Vectorized16 { } Derived exp_u20() const { return static_cast(this)->map_with_vec_float_method(&Vectorized::exp_u20); +======= + return static_cast(this)->map_with_vec_float_method( + &Vectorized::acos); + } + Derived acosh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::acosh); + } + Derived asin() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::asin); + } + Derived asinh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::asinh); + } + Derived atan() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::atan); + } + Derived atanh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::atanh); + } + Derived atan2(const Derived& exp) const { + return static_cast(this)->map2_with_vec_float_method( + exp, &Vectorized::atan2); + } + Derived copysign(const Derived& sign) const { + return static_cast(this)->map2_with_vec_float_method( + sign, &Vectorized::copysign); + } + Derived erf() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erf); + } + Derived erfc() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erfc); + } + Derived erfinv() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erfinv); + } + Derived exp() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp); + } + Derived exp2() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp2); + } + Derived expm1() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::expm1); + } + Derived exp_u20() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp_u20); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived fmod(const Derived& q) const { // This function is questionable with a conversion, so we use map2 return map2(q, std::fmod); } Derived hypot(const Derived& b) const { +<<<<<<< HEAD return static_cast(this)->map2_with_vec_float_method(b, &Vectorized::hypot); } Derived i0() const { @@ -214,12 +322,53 @@ struct Vectorized16 { } Derived log2() const { return static_cast(this)->map_with_vec_float_method(&Vectorized::log2); +======= + return static_cast(this)->map2_with_vec_float_method( + b, &Vectorized::hypot); + } + Derived i0() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::i0); + } + Derived i0e() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::i0e); + } + Derived digamma() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::digamma); + } + Derived igamma(const Derived& x) const { + return static_cast(this)->map2_with_vec_float_method( + x, &Vectorized::igamma); + } + Derived igammac(const Derived& x) const { + return static_cast(this)->map2_with_vec_float_method( + x, &Vectorized::igammac); + } + Derived log() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log); + } + Derived log10() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log10); + } + Derived log1p() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log1p); + } + Derived log2() const { + return 
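The fmod wrapper above (and nextafter further down) deliberately skips the float32 detour and goes through map2, which spills both operands to scratch buffers and applies the scalar libm routine lane by lane. A hedged standalone sketch of that fallback; the container type and lane count here are illustrative, not the header's:

#include <array>
#include <cmath>
#include <cstddef>

// Apply a scalar binary function elementwise, the way the map2 fallback does.
// Slower than widening to float32, but it sidesteps the precision concern the
// comment above raises about doing these particular functions in a wider type.
template <typename T, std::size_t N, typename F>
std::array<T, N> map2_ref(const std::array<T, N>& a,
                          const std::array<T, N>& b,
                          F f) {
  std::array<T, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = f(a[i], b[i]);
  }
  return out;
}

// Usage: map2_ref(x, y, [](float p, float q) { return std::fmod(p, q); });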
static_cast(this)->map_with_vec_float_method( + &Vectorized::log2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived nextafter(const Derived& b) const { // This function does not make sense with conversion, so we use map2 return map2(b, std::nextafter); } Derived sin() const { +<<<<<<< HEAD return static_cast(this)->map_with_vec_float_method(&Vectorized::sin); } Derived sinh() const { @@ -230,6 +379,22 @@ struct Vectorized16 { } Derived cosh() const { return static_cast(this)->map_with_vec_float_method(&Vectorized::cosh); +======= + return static_cast(this)->map_with_vec_float_method( + &Vectorized::sin); + } + Derived sinh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::sinh); + } + Derived cos() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::cos); + } + Derived cosh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::cosh); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived ceil() const { // This function is questionable with a conversion, so we use map @@ -244,6 +409,7 @@ struct Vectorized16 { return map(at::native::round_impl); } Derived tan() const { +<<<<<<< HEAD return static_cast(this)->map_with_vec_float_method(&Vectorized::tan); } Derived tanh() const { @@ -251,16 +417,36 @@ struct Vectorized16 { } Derived lgamma() const { return static_cast(this)->map_with_vec_float_method(&Vectorized::lgamma); +======= + return static_cast(this)->map_with_vec_float_method( + &Vectorized::tan); + } + Derived tanh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::tanh); + } + Derived lgamma() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::lgamma); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Derived rsqrt() const { return static_cast(this)->sqrt().reciprocal(); } Derived pow(const Derived& exp) const { +<<<<<<< HEAD return static_cast(this)->map2_with_vec_float_method(exp, &Vectorized::pow); } }; +======= + return static_cast(this)->map2_with_vec_float_method( + exp, &Vectorized::pow); + } +}; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace CPU_CAPABILITY } // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h b/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h index 5540c8bc782f..a5dfbcc24338 100644 --- a/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/missing_vld1_neon.h @@ -1,261 +1,459 @@ /* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7. 
*/ __extension__ extern __inline uint8x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u8_x2 (const uint8_t *__a) { uint8x8x2_t ret; asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u8_x2(const uint8_t* __a) { + uint8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int8x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s8_x2 (const int8_t *__a) { int8x8x2_t ret; asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s8_x2(const int8_t* __a) { + int8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint16x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u16_x2 (const uint16_t *__a) { uint16x4x2_t ret; asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u16_x2(const uint16_t* __a) { + uint16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int16x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s16_x2 (const int16_t *__a) { int16x4x2_t ret; asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s16_x2(const int16_t* __a) { + int16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint32x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u32_x2 (const uint32_t *__a) { uint32x2x2_t ret; asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u32_x2(const uint32_t* __a) { + uint32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int32x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s32_x2 (const int32_t *__a) { int32x2x2_t ret; asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s32_x2(const int32_t* __a) { + int32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern 
__inline uint64x1x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u64_x2 (const uint64_t *__a) { uint64x1x2_t ret; asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_u64_x2(const uint64_t* __a) { + uint64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int64x1x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s64_x2 (const int64_t *__a) { int64x1x2_t ret; __builtin_aarch64_simd_oi __o; asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_s64_x2(const int64_t* __a) { + int64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float16x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f16_x2 (const float16_t *__a) { float16x4x2_t ret; asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f16_x2(const float16_t* __a) { + float16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float32x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f32_x2 (const float32_t *__a) { float32x2x2_t ret; asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f32_x2(const float32_t* __a) { + float32x2x2_t ret; + asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float64x1x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f64_x2 (const float64_t *__a) { float64x1x2_t ret; asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_f64_x2(const float64_t* __a) { + float64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly8x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p8_x2 (const poly8_t *__a) { poly8x8x2_t ret; asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p8_x2(const poly8_t* __a) { + poly8x8x2_t ret; + asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly16x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p16_x2 (const poly16_t *__a) { poly16x4x2_t ret; asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p16_x2(const poly16_t* __a) { + poly16x4x2_t ret; + asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly64x1x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p64_x2 (const poly64_t *__a) { poly64x1x2_t ret; asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1_p64_x2(const poly64_t* __a) { + poly64x1x2_t ret; + asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint8x16x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u8_x2 (const uint8_t *__a) { uint8x16x2_t ret; asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u8_x2(const uint8_t* __a) { + uint8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int8x16x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s8_x2 (const int8_t *__a) { int8x16x2_t ret; asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s8_x2(const int8_t* __a) { + int8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint16x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u16_x2 (const uint16_t *__a) { uint16x8x2_t ret; asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u16_x2(const uint16_t* __a) { + uint16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int16x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s16_x2 (const int16_t *__a) { int16x8x2_t ret; asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s16_x2(const int16_t* __a) { + int16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint32x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u32_x2 (const uint32_t *__a) { uint32x4x2_t ret; asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u32_x2(const uint32_t* __a) { + uint32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int32x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s32_x2 (const int32_t *__a) { int32x4x2_t ret; asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s32_x2(const int32_t* __a) { + int32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline uint64x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_u64_x2 (const uint64_t *__a) { uint64x2x2_t ret; asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_u64_x2(const uint64_t* __a) { + uint64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline int64x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_s64_x2 (const int64_t *__a) { int64x2x2_t ret; asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_s64_x2(const int64_t* __a) { + int64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float16x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f16_x2 (const float16_t *__a) { float16x8x2_t ret; asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f16_x2(const float16_t* __a) { + float16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float32x4x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f32_x2 (const float32_t *__a) { float32x4x2_t ret; asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f32_x2(const float32_t* __a) { + float32x4x2_t ret; + asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline float64x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_f64_x2 (const float64_t *__a) { float64x2x2_t ret; asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_f64_x2(const float64_t* __a) { + float64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly8x16x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p8_x2 (const poly8_t *__a) { poly8x16x2_t ret; asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p8_x2(const poly8_t* __a) { + poly8x16x2_t ret; + asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly16x8x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p16_x2 (const poly16_t *__a) { poly16x8x2_t ret; asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p16_x2(const poly16_t* __a) { + poly16x8x2_t ret; + asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } __extension__ extern __inline poly64x2x2_t +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1q_p64_x2 (const poly64_t *__a) { poly64x2x2_t ret; asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vld1q_p64_x2(const poly64_t* __a) { + poly64x2x2_t ret; + asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w"(ret) : "Q"(*__a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } /* vst1x2 */ __extension__ extern __inline void +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_s64_x2 (int64_t * __a, int64x1x2_t val) { @@ -449,4 +647,171 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val) { asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s64_x2(int64_t* __a, int64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u64_x2(uint64_t* __a, uint64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f64_x2(float64_t* __a, float64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, 
__gnu_inline__, __artificial__)) + vst1_s8_x2(int8_t* __a, int8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p8_x2(poly8_t* __a, poly8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s16_x2(int16_t* __a, int16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p16_x2(poly16_t* __a, poly16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_s32_x2(int32_t* __a, int32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u8_x2(uint8_t* __a, uint8x8x2_t val) { + asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u16_x2(uint16_t* __a, uint16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_u32_x2(uint32_t* __a, uint32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f16_x2(float16_t* __a, float16x4x2_t val) { + asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_f32_x2(float32_t* __a, float32x2x2_t val) { + asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1_p64_x2(poly64_t* __a, poly64x1x2_t val) { + asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s8_x2(int8_t* __a, int8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p8_x2(poly8_t* __a, poly8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s16_x2(int16_t* __a, int16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p16_x2(poly16_t* __a, poly16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s32_x2(int32_t* __a, int32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : 
"w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_s64_x2(int64_t* __a, int64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u8_x2(uint8_t* __a, uint8x16x2_t val) { + asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u16_x2(uint16_t* __a, uint16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u32_x2(uint32_t* __a, uint32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_u64_x2(uint64_t* __a, uint64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f16_x2(float16_t* __a, float16x8x2_t val) { + asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f32_x2(float32_t* __a, float32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f64_x2(float64_t* __a, float64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +} + +__extension__ extern __inline void + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_p64_x2(poly64_t* __a, poly64x2x2_t val) { + asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q"(*__a) : "w"(val)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h b/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h index 711d16f9b231..9a327d11f1f6 100644 --- a/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/missing_vst1_neon.h @@ -1,8 +1,14 @@ /* Workaround for missing vst1q_f32_x2 in gcc-8. 
*/ __extension__ extern __inline void +<<<<<<< HEAD __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f32_x2 (float32_t * __a, float32x4x2_t val) { asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val)); +======= + __attribute__((__always_inline__, __gnu_inline__, __artificial__)) + vst1q_f32_x2(float32_t* __a, float32x4x2_t val) { + asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q"(*__a) : "w"(val)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 83bb70bdbcbf..42847bcd0c51 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -6,6 +6,7 @@ #include #include +<<<<<<< HEAD #if !(defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR)) #if defined(CPU_CAPABILITY_SVE256) #include @@ -24,6 +25,35 @@ #include #include #include +======= +#if !( \ + defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \ + defined(CPU_CAPABILITY_ZVECTOR)) +#if defined(CPU_CAPABILITY_SVE256) +#include +#else +// clang-format off +#include +#include +#include +#include +#endif +#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16) +#include +#endif +#include +#include +#include +// clang-format on +#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX) +#include +#else +// clang-format off +#include +#include +#include +// clang-format on +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #include @@ -75,34 +105,56 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { return stream; } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX2) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm256_castpd_ps(src); } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm256_castps_pd(src); } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm256_castsi256_ps(src); } +<<<<<<< HEAD template<> inline Vectorized cast(const Vectorized& src) { +======= +template <> +inline Vectorized cast( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_castsi256_pd(src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #ifndef _MSC_VER // MSVC is not working well on complex function overload. 
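All of the inline-asm definitions in the two headers above exist only because gcc-7 and gcc-8 lack the vld1*_x2 / vst1*_x2 intrinsics; the asm forces a single ld1/st1 pair instruction. Functionally each one is just two ordinary loads or stores, as in this hedged sketch (my own helper names, not the headers' approach):

#include <arm_neon.h>

// Functional equivalents of the pair load/store intrinsics the inline asm
// provides. Two plain vld1q/vst1q calls give the same result; the asm versions
// exist to emit a single ld1/st1 {vA, vB} instruction on old compilers.
static inline float32x4x2_t load_f32_x2_fallback(const float32_t* p) {
  float32x4x2_t r;
  r.val[0] = vld1q_f32(p);
  r.val[1] = vld1q_f32(p + 4);
  return r;
}

static inline void store_f32_x2_fallback(float32_t* p, float32x4x2_t v) {
  vst1q_f32(p, v.val[0]);
  vst1q_f32(p + 4, v.val[1]);
}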
+<<<<<<< HEAD template std::enable_if_t> inline gather(const double* base_addr, const Vectorized& vindex) { @@ -112,12 +164,28 @@ inline gather(const double* base_addr, const Vectorized& vindex) { template std::enable_if_t> inline gather(const float* base_addr, const Vectorized& vindex) { +======= +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex) { + return _mm256_i64gather_pd(base_addr, vindex, scale); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_i32gather_ps(base_addr, vindex, scale); } #endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #ifndef _MSC_VER // MSVC is not working well on complex function overload. +<<<<<<< HEAD template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, @@ -129,6 +197,25 @@ template std::enable_if_t> inline mask_gather(const Vectorized& src, const float* base_addr, const Vectorized& vindex, Vectorized& mask) { +======= +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex, + Vectorized& mask) { + return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale); +} + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* base_addr, + const Vectorized& vindex, + Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); } #endif @@ -136,6 +223,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, // Only works for inputs in the range: [-2^51, 2^51] // From: https://stackoverflow.com/a/41148578 +<<<<<<< HEAD template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { @@ -149,10 +237,25 @@ inline convert_to_int_of_same_size(const Vectorized &src) { template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { +======= +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000)); + return _mm256_sub_epi64( + _mm256_castpd_si256(x), + _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_cvttps_epi32(src); } // From: https://stackoverflow.com/a/41148578 +<<<<<<< HEAD template<> Vectorized inline convert_to_fp_of_same_size(const Vectorized &src) { @@ -173,27 +276,69 @@ inline convert_to_fp_of_same_size(const Vectorized &src) { template<> Vectorized inline convert_to_fp_of_same_size(const Vectorized &src) { +======= +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { + __m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); /* 2^52 */ + __m256i magic_i_hi32 = + _mm256_set1_epi64x(0x4530000080000000); /* 2^84 + 2^63 */ + __m256i magic_i_all = + _mm256_set1_epi64x(0x4530000080100000); /* 2^84 + 2^63 + 2^52 */ + 
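convert_to_int_of_same_size<double> above relies on the magic constant 0x0018000000000000, whose value as a double is 2^52 + 2^51, and the int64-to-double path being assembled here splits the same idea across the 2^52 and 2^84 + 2^63 constants. A scalar sketch of the forward trick, valid only for |x| <= 2^51 as the original comment notes (helper name is mine):

#include <cstdint>
#include <cstring>

// Adding 2^52 + 2^51 (= 6755399441055744.0) pins the exponent so the rounded
// integer lands in the mantissa bits; subtracting the magic constant's bit
// pattern recovers it. This rounds to nearest rather than truncating.
int64_t double_to_int64_magic(double x) {
  const double magic = 6755399441055744.0;  // 2^52 + 2^51
  double shifted = x + magic;
  int64_t shifted_bits = 0;
  int64_t magic_bits = 0;
  std::memcpy(&shifted_bits, &shifted, sizeof(shifted_bits));
  std::memcpy(&magic_bits, &magic, sizeof(magic_bits));
  return shifted_bits - magic_bits;
}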
__m256d magic_d_all = _mm256_castsi256_pd(magic_i_all); + + __m256i v_lo = _mm256_blend_epi32( + magic_i_lo, src, 0b01010101); /* v_low = low32 + 2^52 */ + __m256i v_hi = _mm256_srli_epi64(src, 32); + v_hi = _mm256_xor_si256( + v_hi, magic_i_hi32); /* v_hi = high32*2^32 + 2^84 + 2^63 */ + /* int64 = low32 + high32*2^32 = v_hi + v_lo - 2^52 - 2^63 - 2^84 */ + __m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all); + __m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo)); + return result; +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_cvtepi32_ps(src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { // inputs: // a = {a0, a1, a3, a3} +======= +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // b = {b0, b1, b2, b3} // swap lanes: // a_swapped = {a0, a1, b0, b1} // b_swapped = {a2, a3, b2, b3} +<<<<<<< HEAD auto a_swapped = _mm256_permute2f128_pd(a, b, 0b0100000); // 0, 2. 4 bits apart auto b_swapped = _mm256_permute2f128_pd(a, b, 0b0110001); // 1, 3. 4 bits apart +======= + auto a_swapped = + _mm256_permute2f128_pd(a, b, 0b0100000); // 0, 2. 4 bits apart + auto b_swapped = + _mm256_permute2f128_pd(a, b, 0b0110001); // 1, 3. 4 bits apart +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // group cols crossing lanes: // return {a0, b0, a1, b1} // {a2, b2, a3, b3} +<<<<<<< HEAD return std::make_pair(_mm256_permute4x64_pd(a_swapped, 0b11011000), // 0, 2, 1, 3 _mm256_permute4x64_pd(b_swapped, 0b11011000)); // 0, 2, 1, 3 } @@ -201,6 +346,17 @@ inline interleave2(const Vectorized& a, const Vectorized template <> std::pair, Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + _mm256_permute4x64_pd(a_swapped, 0b11011000), // 0, 2, 1, 3 + _mm256_permute4x64_pd(b_swapped, 0b11011000)); // 0, 2, 1, 3 +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, a1, a2, a3, a4, a5, a6, a7} // b = {b0, b1, b2, b3, b4, b5, b6, b7} @@ -209,22 +365,41 @@ inline interleave2(const Vectorized& a, const Vectorized& b // a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3} // b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7} // TODO: can we support caching this? +<<<<<<< HEAD auto a_swapped = _mm256_permute2f128_ps(a, b, 0b0100000); // 0, 2. 4 bits apart auto b_swapped = _mm256_permute2f128_ps(a, b, 0b0110001); // 1, 3. 4 bits apart +======= + auto a_swapped = + _mm256_permute2f128_ps(a, b, 0b0100000); // 0, 2. 4 bits apart + auto b_swapped = + _mm256_permute2f128_ps(a, b, 0b0110001); // 1, 3. 
4 bits apart +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // group cols crossing lanes: // return {a0, b0, a1, b1, a2, b2, a3, b3} // {a4, b4, a5, b5, a6, b6, a7, b7} const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); +<<<<<<< HEAD return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl), _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +======= + return std::make_pair( + _mm256_permutevar8x32_ps(a_swapped, group_ctrl), + _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1} // b = {a2, b2, a3, b3} @@ -232,12 +407,18 @@ inline deinterleave2(const Vectorized& a, const Vectorized>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // swap lanes: // return {a0, a1, a2, a3} // {b0, b1, b2, b3} +<<<<<<< HEAD return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart _mm256_permute2f128_pd(a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart } @@ -245,6 +426,19 @@ inline deinterleave2(const Vectorized& a, const Vectorized std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= + return std::make_pair( + _mm256_permute2f128_pd( + a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart + _mm256_permute2f128_pd( + a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1, a2, b2, a3, b3} // b = {a4, b4, a5, b5, a6, b6, a7, b7} @@ -260,18 +454,32 @@ inline deinterleave2(const Vectorized& a, const Vectorized& // swap lanes: // return {a0, a1, a2, a3, a4, a5, a6, a7} // {b0, b1, b2, b3, b4, b5, b6, b7} +<<<<<<< HEAD return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart _mm256_permute2f128_ps(a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart +======= + return std::make_pair( + _mm256_permute2f128_ps( + a_grouped, b_grouped, 0b0100000), // 0, 2. 4 bits apart + _mm256_permute2f128_ps( + a_grouped, b_grouped, 0b0110001)); // 1, 3. 
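The interleave2 specializations above zip two vectors into even/odd pairs using one in-lane swap plus one cross-lane permute per output, and the deinterleave2 specializations undo it. A scalar reference for the semantics spelled out in their comments (element count is illustrative):

#include <array>
#include <cstddef>
#include <utility>

// interleave2:   {a0..}, {b0..}          -> {a0,b0,a1,b1,...} split in two halves
// deinterleave2: {a0,b0,...}, {a4,b4,...} -> {a0..}, {b0..}
template <typename T, std::size_t N>
std::pair<std::array<T, N>, std::array<T, N>> interleave2_ref(
    const std::array<T, N>& a, const std::array<T, N>& b) {
  std::array<T, N> lo{}, hi{};
  for (std::size_t i = 0; i < N / 2; ++i) {
    lo[2 * i] = a[i];
    lo[2 * i + 1] = b[i];
    hi[2 * i] = a[N / 2 + i];
    hi[2 * i + 1] = b[N / 2 + i];
  }
  return {lo, hi};
}

template <typename T, std::size_t N>
std::pair<std::array<T, N>, std::array<T, N>> deinterleave2_ref(
    const std::array<T, N>& x, const std::array<T, N>& y) {
  std::array<T, N> a{}, b{};
  for (std::size_t i = 0; i < N / 2; ++i) {
    a[i] = x[2 * i];
    b[i] = x[2 * i + 1];
    a[N / 2 + i] = y[2 * i];
    b[N / 2 + i] = y[2 * i + 1];
  }
  return {a, b};
}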
4 bits apart +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); return _mm256_permutevar8x32_ps(v, mask_float); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { return _mm256_permute4x64_pd(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) @@ -284,29 +492,119 @@ inline Vectorized flip(const Vectorized & v) { template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { + return _mm256_permute4x64_pd(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) +} + +template <> +inline Vectorized flip(const Vectorized& v) { + return _mm256_permute4x64_epi64(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3) +} + +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); return _mm256_permutevar8x32_epi32(v, mask_int32); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { const __m256i mask = _mm256_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ); +======= +template <> +inline Vectorized flip(const Vectorized& v) { + const __m256i mask = _mm256_set_epi8( + 1, + 0, + 3, + 2, + 5, + 4, + 7, + 6, + 9, + 8, + 11, + 10, + 13, + 12, + 15, + 14, + 1, + 0, + 3, + 2, + 5, + 4, + 7, + 6, + 9, + 8, + 11, + 10, + 13, + 12, + 15, + 14); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto reversed = _mm256_shuffle_epi8(v, mask); return _mm256_permute2x128_si256(reversed, reversed, 1); } +<<<<<<< HEAD inline __m256i flip8(const __m256i & v) { const __m256i mask_int8 = _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); +======= +inline __m256i flip8(const __m256i& v) { + const __m256i mask_int8 = _mm256_set_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto reversed = _mm256_shuffle_epi8(v, mask_int8); return _mm256_permute2x128_si256(reversed, reversed, 1); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { return flip8(v); @@ -314,6 +612,15 @@ inline Vectorized flip(const Vectorized & v) { template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { + return flip8(v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return flip8(v); } @@ -330,4 +637,9 @@ inline Vectorized operator&&( #endif // (defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD }} // namepsace 
at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h index e661f69b40d7..45ec0ba11f4e 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_16bit_float.h @@ -3,12 +3,22 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! // See Note [Do not compile initializers with AVX] +<<<<<<< HEAD // Used for shared functions and classes for vec256_bfloat16.h and vec256_half.h. // Any functions/classes that are common between those two files should be defined here. // Any non-shared functions/classes should be defined in the respective files. #include #include +======= +// Used for shared functions and classes for vec256_bfloat16.h and +// vec256_half.h. Any functions/classes that are common between those two files +// should be defined here. Any non-shared functions/classes should be defined in +// the respective files. + +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX2) #define SLEEF_STATIC_LIBS @@ -32,7 +42,10 @@ inline namespace CPU_CAPABILITY { #define SLEEF_CONST_OLD #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // bfloat16 conversion static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { o = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16)); @@ -61,7 +74,12 @@ static inline __m128i cvtfp32_bf16(const __m256& src) { t_value = _mm256_srli_epi32(t_value, 16); // Check NaN before converting back to bf16 t_value = _mm256_blendv_epi8(nan, t_value, mask); +<<<<<<< HEAD t_value = _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-4] t[0-4] +======= + t_value = + _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-4] t[0-4] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) t_value = _mm256_permute4x64_epi64(t_value, 0xd8); // 11 01 10 00 return _mm256_castsi256_si128(t_value); } @@ -90,8 +108,14 @@ static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) { t_lo = _mm256_blendv_epi8(nan, t_lo, mask_lo); t_hi = _mm256_blendv_epi8(nan, t_hi, mask_hi); +<<<<<<< HEAD t_lo = _mm256_packus_epi32(t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] return _mm256_permute4x64_epi64(t_lo, 0xd8); // 11 01 10 00 +======= + t_lo = _mm256_packus_epi32( + t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] + return _mm256_permute4x64_epi64(t_lo, 0xd8); // 11 01 10 00 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static inline __m256i merge_compare_result(const __m256& a, const __m256& b) { @@ -116,6 +140,7 @@ static inline void cvtfp16_fp32(const __m256i& a, __m256& o1, __m256& o2) { } static inline __m128i cvtfp32_fp16(const __m256& src) { +<<<<<<< HEAD return _mm256_cvtps_ph( src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); } @@ -125,10 +150,21 @@ static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) { a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); __m128i hi = _mm256_cvtps_ph( b, (_MM_FROUND_TO_NEAREST_INT | 
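The float-to-bfloat16 helper above rounds the 16 bits being dropped and patches NaN lanes through a blend before truncating to the high half. A scalar sketch of that strategy; the rounding bias is the standard ties-to-even trick, and the NaN payload used here is my own illustrative choice rather than the header's constant:

#include <cstdint>
#include <cstring>

// Round-to-nearest-even on the bits being discarded, keep the high 16 bits,
// and force NaN inputs to a fixed NaN pattern so truncation cannot turn a NaN
// with a low-half-only payload into an infinity.
uint16_t fp32_to_bf16_ref(float f) {
  uint32_t bits = 0;
  std::memcpy(&bits, &f, sizeof(bits));
  if (f != f) {
    return 0x7FC0;  // a canonical quiet NaN in bfloat16 (illustrative)
  }
  uint32_t lsb = (bits >> 16) & 1u;  // ties-to-even: look at the kept LSB
  bits += 0x7FFFu + lsb;             // rounding bias
  return static_cast<uint16_t>(bits >> 16);
}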
_MM_FROUND_NO_EXC)); +======= + return _mm256_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + +static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) { + __m128i lo = + _mm256_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m128i hi = + _mm256_cvtps_ph(b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } // dtype conversion between float16/bfloat16 and float32 +<<<<<<< HEAD template , int> = 0> inline void cvt_to_fp32(const __m128i& a, __m256& o); template <> inline void cvt_to_fp32(const __m128i& a, __m256& o) { @@ -160,17 +196,77 @@ template <> inline __m256i cvt_from_fp32(const __m256& a, const __m return cvtfp32_fp16(a, b); } template <> inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { +======= +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m128i& a, __m256& o); +template <> +inline void cvt_to_fp32(const __m128i& a, __m256& o) { + cvtbf16_fp32(a, o); +} +template <> +inline void cvt_to_fp32(const __m128i& a, __m256& o) { + cvtfp16_fp32(a, o); +} + +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2); +template <> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2) { + cvtbf16_fp32(a, o1, o2); +} +template <> +inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2) { + cvtfp16_fp32(a, o1, o2); +} + +template < + typename T, + bool is_compare_op = false, + typename std::enable_if_t, int> = 0> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b); +template <> +inline __m256i cvt_from_fp32( + const __m256& a, + const __m256& b) { + return cvtfp32_bf16(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { + return merge_compare_result(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { + return cvtfp32_fp16(a, b); +} +template <> +inline __m256i cvt_from_fp32(const __m256& a, const __m256& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvtfp32_fp16(a, b); } template class Vectorized16 { +<<<<<<< HEAD static_assert( is_reduced_floating_point_v, "Support only float16 and bfloat16."); protected: __m256i values; public: +======= + static_assert( + is_reduced_floating_point_v, + "Support only float16 and bfloat16."); + + protected: + __m256i values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = uint16_t; using size_type = int; static constexpr size_type size() { @@ -182,6 +278,7 @@ static_assert( value_type uw = val.x; values = _mm256_set1_epi16(uw); } +<<<<<<< HEAD Vectorized16(T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8, T val9, T val10, T val11, T val12, @@ -189,14 +286,57 @@ static_assert( values = _mm256_setr_epi16( val1.x, val2.x, val3.x, val4.x, val5.x, val6.x, val7.x, val8.x, val9.x, val10.x, val11.x, val12.x, val13.x, val14.x, val15.x, val16.x); +======= + Vectorized16( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16) { + values = _mm256_setr_epi16( + 
val1.x, + val2.x, + val3.x, + val4.x, + val5.x, + val6.x, + val7.x, + val8.x, + val9.x, + val10.x, + val11.x, + val12.x, + val13.x, + val14.x, + val15.x, + val16.x); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m256i() const { return values; } T& operator[](int idx) = delete; +<<<<<<< HEAD const T& operator[](int idx) const = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const T& operator[](int idx) const = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i cmp = _mm256_cmpeq_epi16(values, _mm256_set1_epi16(0)); return _mm256_movemask_epi8(cmp); } @@ -261,6 +401,7 @@ static_assert( tmp_values[15] = _mm256_extract_epi16(b.values, 15); return loadu(tmp_values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { return _mm256_blendv_epi8(a.values, b.values, mask.values); @@ -275,6 +416,40 @@ static_assert( } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + T base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -312,9 +487,16 @@ static_assert( return b; } +<<<<<<< HEAD // 'const' type qualifier on return type has no effect, but sleef defines this this way // For example `Sleef_exp2f8_u10` signature is `const __m256 (__m256)` C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wignored-qualifiers") +======= + // 'const' type qualifier on return type has no effect, but sleef defines this + // this way For example `Sleef_exp2f8_u10` signature is `const __m256 + // (__m256)` + C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wignored-qualifiers") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); @@ -322,7 +504,11 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wignored-qualifiers") const auto o2 = vop(hi); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD C10_DIAGNOSTIC_POP() +======= + C10_DIAGNOSTIC_POP() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized isnan() const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); @@ -376,7 +562,11 @@ C10_DIAGNOSTIC_POP() Vectorized atanh() const { return map(Sleef_atanhf8_u10); } +<<<<<<< HEAD 
Vectorized atan2(const Vectorized &b) const { +======= + Vectorized atan2(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 b1, b2; cvt_to_fp32(values, lo, hi); @@ -385,12 +575,20 @@ C10_DIAGNOSTIC_POP() auto o2 = Sleef_atan2f8_u10(hi, b2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized copysign(const Vectorized &sign) const { // copy sign bit (0x8000) from sign and remaining bits from values __m256i mask_value = _mm256_set1_epi32(~0x80008000); __m256i mask_signbit = _mm256_set1_epi32(0x80008000); return Vectorized( _mm256_or_si256( +======= + Vectorized copysign(const Vectorized& sign) const { + // copy sign bit (0x8000) from sign and remaining bits from values + __m256i mask_value = _mm256_set1_epi32(~0x80008000); + __m256i mask_signbit = _mm256_set1_epi32(0x80008000); + return Vectorized(_mm256_or_si256( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_and_si256(values, mask_value), _mm256_and_si256(sign, mask_signbit))); } @@ -426,7 +624,11 @@ C10_DIAGNOSTIC_POP() Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fmod(const Vectorized & q) const { +======= + Vectorized fmod(const Vectorized& q) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 x_lo, x_hi; cvt_to_fp32(values, x_lo, x_hi); __m256 q_lo, q_hi; @@ -435,7 +637,11 @@ C10_DIAGNOSTIC_POP() auto o2 = Sleef_fmodf8(x_hi, q_hi); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 b1, b2; cvt_to_fp32(values, lo, hi); @@ -490,7 +696,11 @@ C10_DIAGNOSTIC_POP() const auto o2 = _mm256_loadu_ps(tmp2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 xlo, xhi; cvt_to_fp32(values, lo, hi); @@ -510,7 +720,11 @@ C10_DIAGNOSTIC_POP() return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 xlo, xhi; cvt_to_fp32(values, lo, hi); @@ -573,8 +787,15 @@ C10_DIAGNOSTIC_POP() Vectorized round() const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); +<<<<<<< HEAD auto o1 = _mm256_round_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); auto o2 = _mm256_round_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + auto o1 = + _mm256_round_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + auto o2 = + _mm256_round_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvt_from_fp32(o1, o2); } Vectorized tan() const { @@ -616,7 +837,11 @@ C10_DIAGNOSTIC_POP() auto o2 = _mm256_div_ps(ones, _mm256_sqrt_ps(hi)); return cvt_from_fp32(o1, o2); } 
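// Illustrative sketch (not part of the patch; helper name is mine): the scalar
// equivalent of the copysign() bit trick above. For bfloat16/half the sign sits
// in bit 15 of the 16-bit payload, so copysign needs no float arithmetic at all:
// keep the low 15 magnitude bits and OR in the sign bit taken from `sign`.
#include <cstdint>

inline uint16_t copysign_bits16(uint16_t magnitude, uint16_t sign) {
  return static_cast<uint16_t>((magnitude & 0x7FFF) | (sign & 0x8000));
}
// The AVX2 member above applies the same two masks (~0x80008000 / 0x80008000)
// to sixteen packed 16-bit lanes at once via _mm256_and_si256 / _mm256_or_si256.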
+<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 lo, hi; __m256 b1, b2; cvt_to_fp32(values, lo, hi); @@ -625,8 +850,14 @@ C10_DIAGNOSTIC_POP() auto o2 = Sleef_powf8_u10(hi, b2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD private: template +======= + + private: + template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline binary_compare(const VectorizedType& b, Op op) const { __m256 a_lo, a_hi; __m256 b_lo, b_hi; @@ -634,6 +865,7 @@ C10_DIAGNOSTIC_POP() cvt_to_fp32(b.values, b_lo, b_hi); auto o1 = op(a_lo, b_lo); auto o2 = op(a_hi, b_hi); +<<<<<<< HEAD return cvt_from_fp32(o1, o2); } @@ -660,6 +892,49 @@ C10_DIAGNOSTIC_POP() template static inline Vectorized binary_op_as_fp32(const Vectorized& a, const Vectorized& b, Op op) { +======= + return cvt_from_fp32(o1, o2); + } + + public: + Vectorized inline operator>(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_GT_OQ); + }); + } + Vectorized inline operator<(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_LT_OQ); + }); + } + Vectorized inline operator>=(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_GE_OQ); + }); + } + Vectorized inline operator<=(const Vectorized& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_LE_OQ); + }); + } + Vectorized inline operator==(const Vectorized16& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); + }); + } + Vectorized inline operator!=(const Vectorized16& other) const { + return binary_compare(other, [](__m256 x, __m256 y) { + return _mm256_cmp_ps(x, y, _CMP_NEQ_UQ); + }); + } +}; + +template +static inline Vectorized binary_op_as_fp32( + const Vectorized& a, + const Vectorized& b, + Op op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvt_to_fp32(__m256i(a), a_lo, a_hi); @@ -669,6 +944,7 @@ static inline Vectorized binary_op_as_fp32(const Vectorized& a, const Vect return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD #define CONVERT_VECTORIZED_INIT(type, name) \ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ __m256 o1, o2; \ @@ -735,3 +1011,80 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec #endif // CPU_CAPABILITY_AVX2 }} // namespace::at::vec::CPU_CAPABILITY +======= +#define CONVERT_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + __m256 o1, o2; \ + cvt_to_fp32(__m256i(a), o1, o2); \ + return std::make_tuple(o1, o2); \ + } \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + return cvt_from_fp32(__m256(a), __m256(b)); \ + } + +#define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + auto values = _mm_loadu_si128(reinterpret_cast(data)); \ + __m256 out_values; \ + cvt_to_fp32(values, out_values); \ + out = out_values; \ + } \ + 
\ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + auto vec = Vectorized::loadu(data); \ + __m256 out1_values, out2_values; \ + cvt_to_fp32(vec, out1_values, out2_values); \ + out1 = out1_values; \ + out2 = out2_values; \ + } + +#else // CPU_CAPABILITY_AVX2 + +#define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr2); \ + convert(arr2, arr, K); \ + return std::make_tuple( \ + Vectorized::loadu(arr), \ + Vectorized::loadu(arr + Vectorized::size())); \ + } \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr); \ + b.store(arr + Vectorized::size()); \ + convert(arr, arr2, K); \ + return Vectorized::loadu(arr2); \ + } + +#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; \ + for (const auto k : c10::irange(Vectorized::size())) { \ + values[k] = data[k]; \ + } \ + out = Vectorized::loadu(values); \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + load_fp32_from_##name(data, out1); \ + data += Vectorized::size(); \ + load_fp32_from_##name(data, out2); \ + } + +#endif // CPU_CAPABILITY_AVX2 +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h index ac69e8613f71..d6c6df51b123 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h @@ -13,8 +13,16 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX2) template <> +<<<<<<< HEAD class Vectorized: public Vectorized16 { public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized16::Vectorized16; using value_type = BFloat16; @@ -29,6 +37,7 @@ class Vectorized: public Vectorized16 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_add_ps(x, y); }); } @@ -67,6 +76,74 @@ inline Vectorized Vectorized::lt(const Vectorized& return (*this < other) & Vectorized(1.0f); } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return 
_mm256_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm256_and_si256(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm256_or_si256(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_si256(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -78,7 +155,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvtbf16_fp32(__m256i(a), a_lo, a_hi); @@ -96,7 +179,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectori // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. 
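// Illustrative sketch (assumes AVX2; not part of the patch, helper name is mine):
// one way to get the NaN-propagating behaviour documented for the maximum()/
// minimum() specializations here. _mm256_max_ps on its own simply returns its
// second operand whenever a lane compares unordered, so the unordered-compare
// mask (all-ones wherever either input is NaN) is OR'ed back in; an all-ones
// bit pattern is itself a NaN, which forces those lanes to NaN in the result.
#include <immintrin.h>

inline __m256 nan_propagating_max_ps(__m256 a, __m256 b) {
  __m256 m   = _mm256_max_ps(a, b);               // not NaN-aware by itself
  __m256 nan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); // 0xFFFFFFFF where a or b is NaN
  return _mm256_or_ps(m, nan);                    // force those lanes to NaN
}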
template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvtbf16_fp32(__m256i(a), a_lo, a_hi); @@ -112,8 +201,15 @@ Vectorized inline minimum(const Vectorized& a, const Vectori } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 min_lo, min_hi; __m256 max_lo, max_hi; @@ -126,7 +222,13 @@ Vectorized inline clamp(const Vectorized& a, } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 max_lo, max_hi; cvtbf16_fp32(__m256i(a), a_lo, a_hi); @@ -137,7 +239,13 @@ Vectorized inline clamp_max(const Vectorized& a, const Vecto } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 min_lo, min_hi; cvtbf16_fp32(__m256i(a), a_lo, a_hi); @@ -153,8 +261,15 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc); } #ifndef __msvc_cl__ @@ -168,7 +283,12 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { template <> inline void convert(const float* src, BFloat16* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a = _mm256_loadu_ps(&src[i]); __m256 b = _mm256_loadu_ps(&src[i + 8]); @@ -182,7 +302,11 @@ inline void convert(const float* src, BFloat16* dst, int64_t n) { template <> inline void convert(const double* src, BFloat16* dst, int64_t n) { +<<<<<<< HEAD auto load_float = [](const double *src) -> __m256 { +======= + auto load_float = [](const double* src) -> __m256 { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Load one float vector from an array of doubles __m128 a = 
_mm256_cvtpd_ps(_mm256_loadu_pd(src)); __m128 b = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4)); @@ -190,7 +314,12 @@ inline void convert(const double* src, BFloat16* dst, int64_t n) { }; int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a = load_float(&src[i]); __m256 b = load_float(&src[i + 8]); @@ -203,8 +332,15 @@ inline void convert(const double* src, BFloat16* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; __m256 c_lo, c_hi; @@ -221,10 +357,21 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16) #else // defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD #if !(defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256)) +======= +#if !( \ + defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE256)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16) #endif LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16) #endif // defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD }} // namsepace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index b4d8776d7ae4..155eaf03e4e2 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -3,10 +3,17 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! 
// See Note [Do not compile initializers with AVX] +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX2) #define SLEEF_STATIC_LIBS @@ -19,10 +26,23 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD template <> class Vectorized> { private: __m256d values; public: +======= +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m256d values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = c10::complex; using size_type = int; static constexpr size_type size() { @@ -33,20 +53,35 @@ template <> class Vectorized> { Vectorized(c10::complex val) { double real_value = val.real(); double imag_value = val.imag(); +<<<<<<< HEAD values = _mm256_setr_pd(real_value, imag_value, real_value, imag_value); } Vectorized(c10::complex val1, c10::complex val2) { values = _mm256_setr_pd(val1.real(), val1.imag(), val2.real(), val2.imag()); +======= + values = _mm256_setr_pd(real_value, imag_value, real_value, imag_value); + } + Vectorized(c10::complex val1, c10::complex val2) { + values = _mm256_setr_pd(val1.real(), val1.imag(), val2.real(), val2.imag()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m256d() const { return values; } template +<<<<<<< HEAD static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) { // convert c10::complex index mask to V index mask: xy -> xxyy static_assert (mask > -1 && mask < 4, "Unexpected mask value"); +======= + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy + static_assert(mask > -1 && mask < 4, "Unexpected mask value"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (mask) { case 0: return a; @@ -54,6 +89,7 @@ template <> class Vectorized> { return _mm256_blend_pd(a.values, b.values, 0x03); case 2: return _mm256_blend_pd(a.values, b.values, 0x0c); +<<<<<<< HEAD case 3: break; } return b; @@ -72,6 +108,31 @@ template <> class Vectorized> { } static Vectorized> set(const Vectorized>& a, const Vectorized>& b, int64_t count = size()) { +======= + case 3: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm256_unpacklo_pd(mask.values, mask.values); + return _mm256_blendv_pd(a.values, b.values, mask_); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>(base, base + step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -80,6 +141,7 @@ template <> class Vectorized> { } return b; } +<<<<<<< HEAD static Vectorized> loadu(const void* ptr, int64_t count = size()) { if (count == size()) return _mm256_loadu_pd(reinterpret_cast(ptr)); @@ -89,6 
+151,20 @@ template <> class Vectorized> { // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. for (const auto i : c10::irange(2*size())) { +======= + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm256_loadu_pd(reinterpret_cast(ptr)); + + __at_align__ double tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(2 * size())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_values[i] = 0.0; } std::memcpy( @@ -101,14 +177,25 @@ template <> class Vectorized> { if (count == size()) { _mm256_storeu_pd(reinterpret_cast(ptr), values); } else if (count > 0) { +<<<<<<< HEAD double tmp_values[2*size()]; +======= + double tmp_values[2 * size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_pd(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); } } +<<<<<<< HEAD const c10::complex& operator[](int idx) const = delete; c10::complex& operator[](int idx) = delete; Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { +======= + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { @@ -117,6 +204,7 @@ template <> class Vectorized> { return loadu(tmp); } __m256d abs_2_() const { +<<<<<<< HEAD auto val_2 = _mm256_mul_pd(values, values); // a*a b*b return _mm256_hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b } @@ -141,6 +229,38 @@ template <> class Vectorized> { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle return _mm256_and_pd(angle, real_mask); // angle 0 +======= + auto val_2 = _mm256_mul_pd(values, values); // a*a b*b + return _mm256_hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b + } + __m256d abs_() const { + auto real = _mm256_movedup_pd(values); // real real + // movehdup_pd does not exist... 
+ auto imag = _mm256_permute_pd(values, 0xf); // imag imag + return Sleef_hypotd4_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + return _mm256_and_pd(abs_(), real_mask); // abs 0 + } + __m256d angle_() const { + // angle = atan2(b/a) + auto b_a = _mm256_permute_pd(values, 0x05); // b a + return Sleef_atan2d4_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle + return _mm256_and_pd(angle, real_mask); // angle 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sgn() const { auto abs = abs_(); @@ -150,14 +270,23 @@ template <> class Vectorized> { return _mm256_blendv_pd(div, zero, mask); } __m256d real_() const { +<<<<<<< HEAD const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); +======= + const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_pd(values, real_mask); } Vectorized> real() const { return real_(); } __m256d imag_() const { +<<<<<<< HEAD const __m256d imag_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF)); return _mm256_and_pd(values, imag_mask); @@ -168,12 +297,32 @@ template <> class Vectorized> { __m256d conj_() const { const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); return _mm256_xor_pd(values, sign_mask); // a -b +======= + const __m256d imag_mask = _mm256_castsi256_pd(_mm256_setr_epi64x( + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF)); + return _mm256_and_pd(values, imag_mask); + } + Vectorized> imag() const { + return _mm256_permute_pd(imag_(), 0x05); // b a + } + __m256d conj_() const { + const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); + return _mm256_xor_pd(values, sign_mask); // a -b +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> conj() const { return conj_(); } Vectorized> log() const { +<<<<<<< HEAD // Most trigonomic ops use the log() op to improve complex number performance. +======= + // Most trigonomic ops use the log() op to improve complex number + // performance. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::log); } Vectorized> log2() const { @@ -188,7 +337,12 @@ template <> class Vectorized> { return map(std::log1p); } Vectorized> asin() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // asin(x) // // = -i*ln(iz + sqrt(1 -z^2)) // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) @@ -196,6 +350,7 @@ template <> class Vectorized> { // const __m256d one = _mm256_set1_pd(1); // auto conj = conj_(); +<<<<<<< HEAD // auto b_a = _mm256_permute_pd(conj, 0x05); //-b a // auto ab = _mm256_mul_pd(conj, b_a); //-ab -ab // auto im = _mm256_add_pd(ab, ab); //-2ab -2ab @@ -207,6 +362,20 @@ template <> class Vectorized> { // auto root = Vectorized(_mm256_blend_pd(re, im, 0x0A)).sqrt(); //sqrt(re + i*im) // auto ln = Vectorized(_mm256_add_pd(b_a, root)).log(); //ln(iz + sqrt()) // return Vectorized(_mm256_permute_pd(ln.values, 0x05)).conj(); //-i*ln() +======= + // auto b_a = _mm256_permute_pd(conj, 0x05); //-b a + // auto ab = _mm256_mul_pd(conj, b_a); //-ab + // -ab auto im = _mm256_add_pd(ab, ab); //-2ab -2ab + + // auto val_2 = _mm256_mul_pd(values, values); // a*a + // b*b auto re = _mm256_hsub_pd(val_2, _mm256_permute_pd(val_2, 0x05)); // + // a*a-b*b b*b-a*a re = _mm256_sub_pd(one, re); + + // auto root = Vectorized(_mm256_blend_pd(re, im, 0x0A)).sqrt(); //sqrt(re + + // i*im) auto ln = Vectorized(_mm256_add_pd(b_a, root)).log(); //ln(iz + + // sqrt()) return Vectorized(_mm256_permute_pd(ln.values, 0x05)).conj(); + // //-i*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::asin); } Vectorized> acos() const { @@ -220,6 +389,7 @@ template <> class Vectorized> { return map(std::atanh); } Vectorized> exp() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //exp(a + bi) // // = exp(a)*(cos(b) + sin(b)i) @@ -229,13 +399,32 @@ template <> class Vectorized> { // auto sin_cos = Sleef_sincosd4_u10(values); //[sin(a), cos(a)] [sin(b), cos(b)] // auto cos_sin = _mm256_blend_pd(_mm256_permute_pd(sin_cos.y, 0x05), // sin_cos.x, 0x0A); //cos(b) sin(b) +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expd4_u10(values); //exp(a) exp(b) exp = + // _mm256_blend_pd(exp, _mm256_permute_pd(exp, 0x05), 0x0A); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosd4_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm256_blend_pd(_mm256_permute_pd(sin_cos.y, + // 0x05), + // sin_cos.x, 0x0A); //cos(b) sin(b) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm256_mul_pd(exp, cos_sin); return map(std::exp); } Vectorized> exp2() const { // Use identity 2**x = exp(log(2) * x) const __m256d ln_2 = _mm256_set1_pd(c10::ln_2); +<<<<<<< HEAD Vectorized> scaled_values = _mm256_mul_pd(values, ln_2); +======= + Vectorized> scaled_values = + _mm256_mul_pd(values, ln_2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return scaled_values.exp(); } Vectorized> expm1() const { @@ -264,7 +453,12 @@ template <> class Vectorized> { return _mm256_sub_pd(zero, values); } Vectorized> round() const { +<<<<<<< HEAD return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm256_round_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> tan() const { return map(std::tan); @@ -282,7 +476,12 @@ template <> class Vectorized> { Vectorized> rsqrt() const { return sqrt().reciprocal(); } +<<<<<<< HEAD Vectorized> pow(const Vectorized> &exp) const { +======= + Vectorized> pow( + const Vectorized>& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex x_tmp[size()]; __at_align__ c10::complex y_tmp[size()]; store(x_tmp); @@ -295,6 +494,7 @@ template <> class Vectorized> { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +<<<<<<< HEAD Vectorized> operator==(const Vectorized>& other) const { return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); } @@ -341,13 +541,88 @@ template <> Vectorized> inline operator*(const Vectorized Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
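// Illustrative sketch (assumes AVX2; not part of the patch): the effect of the
// _CMP_EQ_OQ / _CMP_NEQ_UQ predicates used by the complex operator== / operator!=
// above. "O"rdered predicates yield false when either lane is NaN, "U"nordered
// predicates yield true, and the "Q" (quiet) variants never raise FP exceptions.
#include <immintrin.h>
#include <cmath>

inline void cmp_predicate_demo(int* eq_mask, int* ne_mask) {
  const __m256d x = _mm256_set1_pd(std::nan(""));
  const __m256d y = _mm256_set1_pd(1.0);
  // EQ_OQ: NaN lanes compare as "not equal" -> mask bits come back 0
  *eq_mask = _mm256_movemask_pd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ));  // 0x0
  // NEQ_UQ: NaN lanes compare as "not equal" -> mask bits come back 1
  *ne_mask = _mm256_movemask_pd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); // 0xF
}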
+======= + Vectorized> operator==( + const Vectorized>& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); + } + Vectorized> operator!=( + const Vectorized>& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); + } + Vectorized> operator<( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>&) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_add_pd(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_sub_pd(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm256_mul_pd(a, b); // ac bd + + auto d_c = _mm256_permute_pd(b, 0x05); // d c + d_c = _mm256_xor_pd(sign_mask, d_c); // d -c + auto ad_bc = _mm256_mul_pd(a, d_c); // ad -bc + + auto ret = _mm256_hsub_pd(ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
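// Illustrative sketch (not from the patch; helper name is mine): scalar form of
// the interleaved {re, im} product computed by the AVX2 complex operator* above,
//   (a + bi) * (c + di) = (ac - bd) + (ad + bc)i.
// The vector code gets {ac, bd} with one multiply, {ad, -bc} with a permute plus
// sign flip plus multiply, and a single _mm256_hsub_pd then produces
// {ac - bd, ad - (-bc)} = {ac - bd, ad + bc}.
inline void complex_mul_scalar(const double a[2], const double b[2], double out[2]) {
  const double ac = a[0] * b[0], bd = a[1] * b[1]; // lanes of _mm256_mul_pd(a, b)
  const double ad = a[0] * b[1], bc = a[1] * b[0]; // lanes of a * sign-flipped permute of b
  out[0] = ac - bd; // real part
  out[1] = ad + bc; // imaginary part
}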
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // auto mask = _mm256_set1_pd(-0.f); // auto fabs_cd = _mm256_andnot_pd(mask, b); // |c| |d| // auto fabs_dc = _mm256_permute_pd(fabs_cd, 0x05); // |d| |c| +<<<<<<< HEAD // auto scale = _mm256_div_pd(_mm256_set1_pd(1.0f), _mm256_max_pd(fabs_cd, fabs_dc)); // 1/sc 1/sc // auto a2 = _mm256_mul_pd(a, scale); // a/sc b/sc // auto b2 = _mm256_mul_pd(b, scale); // c/sc d/sc +======= + // auto scale = _mm256_div_pd(_mm256_set1_pd(1.0f), _mm256_max_pd(fabs_cd, + // fabs_dc)); // 1/sc 1/sc auto a2 = _mm256_mul_pd(a, scale); // + // a/sc b/sc auto b2 = _mm256_mul_pd(b, scale); // c/sc d/sc +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // auto acbd2 = _mm256_mul_pd(a2, b2); // const __m256d sign_mask = _mm256_setr_pd(-0.0, 0.0, -0.0, 0.0); @@ -357,12 +632,24 @@ template <> Vectorized> inline operator/(const Vectorized>(b2).abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 // res2 = _mm256_div_pd(res2, denom2); // return res2; __at_align__ c10::complex tmp1[Vectorized>::size()]; __at_align__ c10::complex tmp2[Vectorized>::size()]; __at_align__ c10::complex out[Vectorized>::size()]; +======= + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm256_div_pd(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a.store(tmp1); b.store(tmp2); for (const auto i : c10::irange(Vectorized>::size())) { @@ -372,8 +659,15 @@ template <> Vectorized> inline operator/(const Vectorized> Vectorized>::reciprocal() const{ // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // //re = (ac + bd)/abs_2() = c/abs_2() // //im = (bc - ad)/abs_2() = d/abs_2() @@ -388,21 +682,41 @@ inline Vectorized> Vectorized>::recipr return loadu(tmp); } +<<<<<<< HEAD inline Vectorized> Vectorized>::atan() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // atan(x) = i/2 * ln((i + z)/(i - z)) // const __m256d i = _mm256_setr_pd(0.0, 1.0, 0.0, 1.0); // const Vectorized i_half = _mm256_setr_pd(0.0, 0.5, 0.0, 0.5); +<<<<<<< HEAD // auto sum = Vectorized(_mm256_add_pd(i, values)); // a 1+b // auto sub = Vectorized(_mm256_sub_pd(i, values)); // -a 1-b // auto ln = (sum/sub).log(); // ln((i + z)/(i - z)) // return i_half*ln; // i/2*ln() +======= + // auto sum = Vectorized(_mm256_add_pd(i, values)); // a + // 1+b auto sub = Vectorized(_mm256_sub_pd(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::atan); } template <> +<<<<<<< HEAD Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_LT_OQ); @@ -413,7 +727,13 @@ Vectorized> inline maximum(const Vectorized +<<<<<<< HEAD Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_GT_OQ); @@ -424,16 +744,29 @@ Vectorized> inline minimum(const Vectorized +<<<<<<< HEAD Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_pd(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_or_pd(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) { return _mm256_xor_pd(a, b); } @@ -448,8 +781,37 @@ inline Vectorized> Vectorized>::ne(con auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal return (ne.real() | ne.imag()) & Vectorized>(_mm256_set1_pd(1.0)); +======= +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_xor_pd(a, b); +} + +inline Vectorized> Vectorized>::eq( + const Vectorized>& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm256_set1_pd(1.0)); +} + +inline Vectorized> Vectorized>::ne( + const Vectorized>& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real 
numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm256_set1_pd(1.0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index bec9490c7554..1ac0865043ee 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -3,10 +3,17 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! // See Note [Do not compile initializers with AVX] +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX2) #define SLEEF_STATIC_LIBS #include @@ -18,10 +25,23 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD template <> class Vectorized> { private: __m256 values; public: +======= +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m256 values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = c10::complex; using size_type = int; static constexpr size_type size() { @@ -32,6 +52,7 @@ template <> class Vectorized> { Vectorized(c10::complex val) { float real_value = val.real(); float imag_value = val.imag(); +<<<<<<< HEAD values = _mm256_setr_ps(real_value, imag_value, real_value, imag_value, real_value, imag_value, @@ -44,18 +65,52 @@ template <> class Vectorized> { val3.real(), val3.imag(), val4.real(), val4.imag() ); +======= + values = _mm256_setr_ps( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4) { + values = _mm256_setr_ps( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m256() const { return values; } template +<<<<<<< HEAD static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) { // convert c10::complex index mask to V index mask: xy -> xxyy +======= + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(mask > -1 && mask < 16, "Unexpected mask range"); switch (mask) { case 0: return a; case 1: +<<<<<<< HEAD return _mm256_blend_ps(a.values, b.values, 0x03); //b0000 0001 = b0000 0011 case 2: return _mm256_blend_ps(a.values, b.values, 0x0C); //b0000 0010 = b0000 1100 @@ -103,6 +158,76 @@ template <> class Vectorized> { } static Vectorized> set(const Vectorized>& a, const Vectorized>& b, int64_t count = size()) { 
+======= + return _mm256_blend_ps( + a.values, b.values, 0x03); // b0000 0001 = b0000 0011 + case 2: + return _mm256_blend_ps( + a.values, b.values, 0x0C); // b0000 0010 = b0000 1100 + case 3: + return _mm256_blend_ps( + a.values, b.values, 0x0F); // b0000 0011 = b0000 1111 + case 4: + return _mm256_blend_ps( + a.values, b.values, 0x30); // b0000 0100 = b0011 0000 + case 5: + return _mm256_blend_ps( + a.values, b.values, 0x33); // b0000 0101 = b0011 0011 + case 6: + return _mm256_blend_ps( + a.values, b.values, 0x3C); // b0000 0110 = b0011 1100 + case 7: + return _mm256_blend_ps( + a.values, b.values, 0x3F); // b0000 0111 = b0011 1111 + case 8: + return _mm256_blend_ps( + a.values, b.values, 0xC0); // b0000 1000 = b1100 0000 + case 9: + return _mm256_blend_ps( + a.values, b.values, 0xC3); // b0000 1001 = b1100 0011 + case 10: + return _mm256_blend_ps( + a.values, b.values, 0xCC); // b0000 1010 = b1100 1100 + case 11: + return _mm256_blend_ps( + a.values, b.values, 0xCF); // b0000 1011 = b1100 1111 + case 12: + return _mm256_blend_ps( + a.values, b.values, 0xF0); // b0000 1100 = b1111 0000 + case 13: + return _mm256_blend_ps( + a.values, b.values, 0xF3); // b0000 1101 = b1111 0011 + case 14: + return _mm256_blend_ps( + a.values, b.values, 0xFC); // b0000 1110 = b1111 1100 + default: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm256_unpacklo_ps(mask.values, mask.values); + return _mm256_blendv_ps(a.values, b.values, mask_); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + step, + base + c10::complex(2) * step, + base + c10::complex(3) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -115,6 +240,7 @@ template <> class Vectorized> { } return b; } +<<<<<<< HEAD static Vectorized> loadu(const void* ptr, int64_t count = size()) { if (count == size()) return _mm256_loadu_ps(reinterpret_cast(ptr)); @@ -124,6 +250,20 @@ template <> class Vectorized> { // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. for (const auto i : c10::irange(2*size())) { +======= + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm256_loadu_ps(reinterpret_cast(ptr)); + + __at_align__ float tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
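// Illustrative sketch (assumes AVX2; not part of the patch, helper name is mine):
// the partial-load pattern used by loadu(ptr, count) above when count < size().
// A stack buffer is zeroed first (with a loop rather than "={0}", per the comment
// above), only `count` elements are copied in, and the full vector is then loaded,
// so lanes past `count` hold a deterministic 0 instead of uninitialized memory.
#include <cstring>
#include <immintrin.h>

inline __m256 load_partial_ps(const float* src, int count /* 0..8 */) {
  alignas(32) float tmp[8];
  for (int i = 0; i < 8; ++i) {
    tmp[i] = 0.0f;
  }
  std::memcpy(tmp, src, static_cast<size_t>(count) * sizeof(float));
  return _mm256_loadu_ps(tmp);
}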
+ for (const auto i : c10::irange(2 * size())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_values[i] = 0.0; } std::memcpy( @@ -136,14 +276,25 @@ template <> class Vectorized> { if (count == size()) { _mm256_storeu_ps(reinterpret_cast(ptr), values); } else if (count > 0) { +<<<<<<< HEAD float tmp_values[2*size()]; +======= + float tmp_values[2 * size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_ps(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); } } +<<<<<<< HEAD const c10::complex& operator[](int idx) const = delete; c10::complex& operator[](int idx) = delete; Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { +======= + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { @@ -152,6 +303,7 @@ template <> class Vectorized> { return loadu(tmp); } __m256 abs_2_() const { +<<<<<<< HEAD auto val_2 = _mm256_mul_ps(values, values); // a*a b*b auto ret = _mm256_hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b return _mm256_permute_ps(ret, 0xD8); @@ -176,6 +328,46 @@ template <> class Vectorized> { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle return _mm256_and_ps(angle, real_mask); // angle 0 +======= + auto val_2 = _mm256_mul_ps(values, values); // a*a b*b + auto ret = _mm256_hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b + return _mm256_permute_ps(ret, 0xD8); + } + __m256 abs_() const { + auto real = _mm256_moveldup_ps(values); // real real + auto imag = _mm256_movehdup_ps(values); // imag imag + return Sleef_hypotf8_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + return _mm256_and_ps(abs_(), real_mask); // abs 0 + } + __m256 angle_() const { + // angle = atan2(b/a) + auto b_a = _mm256_permute_ps(values, 0xB1); // b a + return Sleef_atan2f8_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle + return _mm256_and_ps(angle, real_mask); // angle 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sgn() const { auto abs = abs_(); @@ -185,14 +377,27 @@ template <> class Vectorized> { return _mm256_blendv_ps(div, zero, mask); } __m256 real_() const { +<<<<<<< HEAD const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); +======= + const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_ps(values, real_mask); } Vectorized> real() const { return real_(); } __m256 imag_() const { +<<<<<<< HEAD const __m256 imag_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF)); return _mm256_and_ps(values, imag_mask); @@ -203,12 +408,37 @@ template <> class Vectorized> { __m256 conj_() const { const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); return _mm256_xor_ps(values, sign_mask); // a -b +======= + const __m256 imag_mask = _mm256_castsi256_ps(_mm256_setr_epi32( + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF)); + return _mm256_and_ps(values, imag_mask); + } + Vectorized> imag() const { + return _mm256_permute_ps(imag_(), 0xB1); // b a + } + __m256 conj_() const { + const __m256 sign_mask = + _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + return _mm256_xor_ps(values, sign_mask); // a -b +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> conj() const { return conj_(); } Vectorized> log() const { +<<<<<<< HEAD // Most trigonomic ops use the log() op to improve complex number performance. +======= + // Most trigonomic ops use the log() op to improve complex number + // performance. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::log); } Vectorized> log2() const { @@ -223,7 +453,12 @@ template <> class Vectorized> { return map(std::log1p); } Vectorized> asin() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // asin(x) // // = -i*ln(iz + sqrt(1 -z^2)) // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) @@ -231,6 +466,7 @@ template <> class Vectorized> { // const __m256 one = _mm256_set1_ps(1); // auto conj = conj_(); +<<<<<<< HEAD // auto b_a = _mm256_permute_ps(conj, 0xB1); //-b a // auto ab = _mm256_mul_ps(conj, b_a); //-ab -ab // auto im = _mm256_add_ps(ab, ab); //-2ab -2ab @@ -243,6 +479,21 @@ template <> class Vectorized> { // auto root = Vectorized(_mm256_blend_ps(re, im, 0xAA)).sqrt(); //sqrt(re + i*im) // auto ln = Vectorized(_mm256_add_ps(b_a, root)).log(); //ln(iz + sqrt()) // return Vectorized(_mm256_permute_ps(ln.values, 0xB1)).conj(); //-i*ln() +======= + // auto b_a = _mm256_permute_ps(conj, 0xB1); //-b a + // auto ab = _mm256_mul_ps(conj, b_a); //-ab + // -ab auto im = _mm256_add_ps(ab, ab); //-2ab -2ab + + // auto val_2 = _mm256_mul_ps(values, values); // a*a + // b*b auto re = _mm256_hsub_ps(val_2, _mm256_permute_ps(val_2, 0xB1)); // + // a*a-b*b b*b-a*a re = _mm256_permute_ps(re, 0xD8); re = + // _mm256_sub_ps(one, re); + + // auto root = Vectorized(_mm256_blend_ps(re, im, 0xAA)).sqrt(); //sqrt(re + + // i*im) auto ln = Vectorized(_mm256_add_ps(b_a, root)).log(); //ln(iz + + // sqrt()) return Vectorized(_mm256_permute_ps(ln.values, 0xB1)).conj(); + // //-i*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::asin); } Vectorized> acos() const { @@ -253,6 +504,7 @@ template <> class Vectorized> { return map(std::atanh); } Vectorized> exp() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //exp(a + bi) // // = exp(a)*(cos(b) + sin(b)i) @@ -262,6 +514,20 @@ template <> class Vectorized> { // auto sin_cos = Sleef_sincosf8_u10(values); //[sin(a), cos(a)] [sin(b), cos(b)] // auto cos_sin = _mm256_blend_ps(_mm256_permute_ps(sin_cos.y, 0xB1), // sin_cos.x, 0xAA); //cos(b) sin(b) +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expf8_u10(values); //exp(a) exp(b) exp = + // _mm256_blend_ps(exp, _mm256_permute_ps(exp, 0xB1), 0xAA); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosf8_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm256_blend_ps(_mm256_permute_ps(sin_cos.y, + // 0xB1), + // sin_cos.x, 0xAA); //cos(b) sin(b) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm256_mul_ps(exp, cos_sin); return map(std::exp); } @@ -297,7 +563,12 @@ template <> class Vectorized> { return _mm256_sub_ps(zero, values); } Vectorized> round() const { +<<<<<<< HEAD return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm256_round_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> tan() const { return map(std::tan); @@ -315,7 +586,12 @@ template <> class Vectorized> { Vectorized> rsqrt() const { return sqrt().reciprocal(); } +<<<<<<< HEAD Vectorized> pow(const Vectorized> &exp) const { +======= + Vectorized> pow( + const Vectorized>& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex x_tmp[size()]; __at_align__ c10::complex y_tmp[size()]; store(x_tmp); @@ -328,6 +604,7 @@ template <> class Vectorized> { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +<<<<<<< HEAD Vectorized> operator==(const Vectorized>& other) const { return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); } @@ -369,34 +646,123 @@ template <> Vectorized> inline operator*(const Vectorized> operator==( + const Vectorized>& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); + } + Vectorized> operator!=( + const Vectorized>& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ); + } + Vectorized> operator<( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& /*other*/) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_add_ps(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm256_sub_ps(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m256 sign_mask = + _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm256_mul_ps(a, b); // ac bd + + auto d_c = _mm256_permute_ps(b, 0xB1); // d c + d_c = _mm256_xor_ps(sign_mask, d_c); // d -c + auto ad_bc = _mm256_mul_ps(a, d_c); // ad -bc + + auto ret = _mm256_hsub_ps(ac_bd, ad_bc); // ac - bd ad + bc +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ret = _mm256_permute_ps(ret, 0xD8); return ret; } +<<<<<<< HEAD template <> Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // auto mask = _mm256_set1_ps(-0.f); // auto fabs_cd = _mm256_andnot_ps(mask, b); // |c| |d| // auto fabs_dc = _mm256_permute_ps(fabs_cd, 0xB1); // |d| |c| +<<<<<<< HEAD // auto scale = _mm256_rcp_ps(_mm256_max_ps(fabs_cd, fabs_dc)); // 1/sc 1/sc +======= + // auto scale = _mm256_rcp_ps(_mm256_max_ps(fabs_cd, fabs_dc)); // 1/sc 1/sc +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // auto a2 = _mm256_mul_ps(a, scale); // a/sc b/sc // auto b2 = _mm256_mul_ps(b, scale); // c/sc d/sc // auto acbd2 = _mm256_mul_ps(a2, b2); +<<<<<<< HEAD // const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0); // auto dc2 = _mm256_permute_ps(b2, 0xB1); // d/sc c/sc +======= + // const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); auto dc2 = _mm256_permute_ps(b2, 0xB1); // d/sc c/sc +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // dc2 = _mm256_xor_ps(sign_mask, dc2); // -d/|c,d| c/sc // auto adbc2 = _mm256_mul_ps(a2, dc2); //-ad/sc^2 bc/sc^2 // auto res2 = _mm256_hadd_ps(acbd2, adbc2); //(ac+bd)/sc^2 (bc-ad)/sc^2 // res2 = _mm256_permute_ps(res2, 0xD8); // // get the denominator +<<<<<<< HEAD // auto denom2 = Vectorized>(b2).abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 // res2 = _mm256_div_ps(res2, denom2); // return res2; __at_align__ c10::complex tmp1[Vectorized>::size()]; __at_align__ c10::complex tmp2[Vectorized>::size()]; +======= + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm256_div_ps(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex out[Vectorized>::size()]; a.store(tmp1); b.store(tmp2); @@ -407,6 +773,7 @@ template <> Vectorized> inline operator/(const Vectorized> Vectorized>::reciprocal() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //re + im*i = (a + bi) / (c + di) @@ -414,6 +781,17 @@ inline Vectorized> Vectorized>::reciproc // //im = (bc - ad)/abs_2() = d/abs_2() // const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); // auto c_d = _mm256_xor_ps(sign_mask, values); //c -d +======= +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
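// --- Illustrative sketch (not part of the patch) ---------------------------
// The commented-out vectorized path in operator/ above divides (a + bi) by
// (c + di) after pre-scaling both operands by 1/max(|c|, |d|), so the
// squared-magnitude denominator does not overflow. A minimal scalar sketch of
// that scaling scheme, ignoring the 0/Inf/NaN special cases the TODO mentions
// (the patch itself falls back to std::complex division for full coverage);
// the function name below is illustrative only.
#include <algorithm>
#include <cmath>
#include <complex>

static std::complex<float> scaled_complex_div(std::complex<float> x,
                                              std::complex<float> y) {
  const float a = x.real(), b = x.imag();
  const float c = y.real(), d = y.imag();
  const float sc = std::max(std::fabs(c), std::fabs(d));  // scale factor
  const float a2 = a / sc, b2 = b / sc;                   // a/sc  b/sc
  const float c2 = c / sc, d2 = d / sc;                   // c/sc  d/sc
  const float denom = c2 * c2 + d2 * d2;                  // (c^2 + d^2)/sc^2
  const float re = (a2 * c2 + b2 * d2) / denom;           // (ac + bd)/(c^2 + d^2)
  const float im = (b2 * c2 - a2 * d2) / denom;           // (bc - ad)/(c^2 + d^2)
  return {re, im};
}
// ----------------------------------------------------------------------------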
+ // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); auto c_d = _mm256_xor_ps(sign_mask, values); //c -d +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm256_div_ps(c_d, abs_2_()); __at_align__ c10::complex tmp[size()]; store(tmp); @@ -423,6 +801,7 @@ inline Vectorized> Vectorized>::reciproc return loadu(tmp); } +<<<<<<< HEAD inline Vectorized> Vectorized>::atan() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // // atan(x) = i/2 * ln((i + z)/(i - z)) @@ -433,11 +812,32 @@ inline Vectorized> Vectorized>::atan() c // auto sub = Vectorized(_mm256_sub_ps(i, values)); // -a 1-b // auto ln = (sum/sub).log(); // ln((i + z)/(i - z)) // return i_half*ln; // i/2*ln() +======= +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m256 i = _mm256_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm256_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm256_add_ps(i, values)); // a + // 1+b auto sub = Vectorized(_mm256_sub_ps(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::atan); } template <> +<<<<<<< HEAD Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ); @@ -448,7 +848,13 @@ Vectorized> inline maximum(const Vectorized +<<<<<<< HEAD Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ); @@ -459,34 +865,73 @@ Vectorized> inline minimum(const Vectorized +<<<<<<< HEAD Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_ps(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_or_ps(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline 
operator^( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_xor_ps(a, b); } inline Vectorized> Vectorized>::eq( const Vectorized>& other) const { +<<<<<<< HEAD auto eq = (*this == other); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal return (eq.real() & eq.imag()) & Vectorized>(_mm256_set1_ps(1.0f)); +======= + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm256_set1_ps(1.0f)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline Vectorized> Vectorized>::ne( const Vectorized>& other) const { +<<<<<<< HEAD auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal return (ne.real() | ne.imag()) & Vectorized>(_mm256_set1_ps(1.0f)); +======= + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm256_set1_ps(1.0f)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h index 9dbdb4f3dfb2..9517b11d7a89 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h @@ -1,6 +1,10 @@ #pragma once +<<<<<<< HEAD #include +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -117,6 +121,7 @@ struct VecConvert { src.store(buffer); at::vec::VectorizedN result; result[0] = Vectorized( +<<<<<<< HEAD static_cast(buffer[0]), static_cast(buffer[1]), static_cast(buffer[2]), @@ -126,6 +131,17 @@ struct VecConvert { static_cast(buffer[5]), static_cast(buffer[6]), static_cast(buffer[7])); +======= + static_cast(buffer[0]), + static_cast(buffer[1]), + static_cast(buffer[2]), + static_cast(buffer[3])); + result[1] = Vectorized( + static_cast(buffer[4]), + static_cast(buffer[5]), + static_cast(buffer[6]), + static_cast(buffer[7])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } }; @@ -171,12 +187,19 @@ struct VecConvert { } }; +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> struct VecConvert { static inline VectorizedN apply( const VectorizedN& src) { +<<<<<<< HEAD return Vectorized(_mm256_cvttps_epi32(src[0])); +======= + return Vectorized(_mm256_cvttps_epi32(src[0])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half 
(#2791)) } }; @@ -184,7 +207,11 @@ template <> struct VecConvert { static inline VectorizedN apply( const VectorizedN& src) { +<<<<<<< HEAD return Vectorized(_mm256_cvtepi32_ps(src[0])); +======= + return Vectorized(_mm256_cvtepi32_ps(src[0])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -219,15 +246,24 @@ struct VecConvert< 1, float, 2, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { at::vec::Vectorized vec1 = convert_float_to_int8(src[0]); at::vec::Vectorized vec2 = convert_float_to_int8(src[1]); __m128 lane2 = _mm256_castps256_ps128(_mm256_castsi256_ps(vec2)); __m256 combined = _mm256_insertf128_ps(_mm256_castsi256_ps(vec1), lane2, 1); // Shuffle [191:128] bit from combined in to [127:64] bit of result +<<<<<<< HEAD __m256i result = _mm256_permute4x64_epi64(_mm256_castps_si256(combined), 0b11011000); +======= + __m256i result = + _mm256_permute4x64_epi64(_mm256_castps_si256(combined), 0b11011000); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::vec::Vectorized(result); } }; @@ -238,8 +274,12 @@ struct VecConvert< 1, float, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_float_to_int8(src[0]); } @@ -251,6 +291,7 @@ struct VecConvert< 2, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { static inline VectorizedN apply(const VectorizedN& src) { @@ -262,6 +303,19 @@ struct VecConvert< ) ); return VectorizedN(convert_int8_to_float(src[0]), convert_int8_to_float(src2)); +======= + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + // Shuffle [127:64] bit from src[0] in to [191:128] bit of shuffled + __m256i shuffled = _mm256_permute4x64_epi64(src[0], 0b11011000); + __m256i src2 = + _mm256_castsi128_si256(_mm_castps_si128(_mm256_extractf128_ps( + _mm256_castsi256_ps(shuffled), 1) // Extract the second 128-bit lane + )); + return VectorizedN( + convert_int8_to_float(src[0]), + convert_int8_to_float(src2)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -272,8 +326,12 @@ struct VecConvert< int64_t, 2, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const VectorizedN& src) { return VecConvert::apply( @@ -283,7 +341,10 @@ struct VecConvert< #endif /* defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) */ +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)) template struct VecConvert< @@ -291,14 +352,63 @@ struct VecConvert< 1, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, 
void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_int8_to_float(src[0]); } }; #endif +<<<<<<< HEAD +======= +#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + // Load 16-bit unsigned integers from src into an SVE vector + svuint16_t u16x4 = + svld1_u16(svptrue_b16(), reinterpret_cast(&src[0])); + // Zero-extend to 32-bit SVE does not have direct vmovl_u16 equivalent. + vls_uint32_t u32x4 = + svreinterpret_u32_u16(svzip1_u16(svdup_n_u16(0), u16x4)); + // Reinterpret as float32 + vls_float32_t f32x4 = svreinterpret_f32_u32(u32x4); + res[0] = Vectorized(f32x4); + return res; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + std::tie(res[0], res[1]) = convert_bfloat16_float(src[0]); + return res; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN res; + res[0] = convert_float_bfloat16(src[0], src[1]); + return res; + } +}; + +#endif // defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct VecConvert< float, diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h index b4b878859cbb..e38c983c017c 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h @@ -15,6 +15,7 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { +<<<<<<< HEAD #if defined(CPU_CAPABILITY_AVX2) @@ -22,6 +23,19 @@ template <> class Vectorized { private: __m256d values; public: +======= +#if defined(CPU_CAPABILITY_AVX2) + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + __m256d values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = double; using size_type = int; static constexpr size_type size() { @@ -39,6 +53,7 @@ template <> class Vectorized { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm256_blend_pd(a.values, b.values, mask); } @@ -52,6 +67,30 @@ template <> class Vectorized { } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm256_blend_pd(a.values, b.values, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_pd(a.values, b.values, mask.values); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -68,11 +107,19 @@ template <> class Vectorized { if 
(count == size()) return _mm256_loadu_pd(reinterpret_cast(ptr)); +<<<<<<< HEAD __at_align__ double tmp_values[size()]; // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + __at_align__ double tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } @@ -91,10 +138,18 @@ template <> class Vectorized { std::memcpy(ptr, tmp_values, count * sizeof(double)); } } +<<<<<<< HEAD const double& operator[](int idx) const = delete; double& operator[](int idx) = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ); return _mm256_movemask_pd(cmp); } @@ -102,8 +157,14 @@ template <> class Vectorized { return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q); } bool has_inf_nan() const { +<<<<<<< HEAD __m256d self_sub = _mm256_sub_pd(values, values); return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != 0; +======= + __m256d self_sub = _mm256_sub_pd(values, values); + return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != + 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; @@ -156,10 +217,17 @@ template <> class Vectorized { Vectorized atanh() const { return Vectorized(Sleef_atanhd4_u10(values)); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { return Vectorized(Sleef_atan2d4_u10(values, b)); } Vectorized copysign(const Vectorized &sign) const { +======= + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2d4_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_copysignd4(values, sign)); } Vectorized erf() const { @@ -186,7 +254,11 @@ template <> class Vectorized { Vectorized fmod(const Vectorized& q) const { return Vectorized(Sleef_fmodd4(values, q)); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_hypotd4_u05(values, b)); } Vectorized i0() const { @@ -198,7 
+270,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -208,7 +284,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -252,11 +332,20 @@ template <> class Vectorized { Vectorized neg() const { return _mm256_xor_pd(_mm256_set1_pd(-0.), values); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { return Vectorized(Sleef_nextafterd4(values, b)); } Vectorized round() const { return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterd4(values, b)); + } + Vectorized round() const { + return _mm256_round_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized tan() const { return Vectorized(Sleef_tand4_u10(values)); @@ -279,7 +368,11 @@ template <> class Vectorized { Vectorized rsqrt() const { return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values)); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_powd4_u10(values, b)); } // Comparison using the _CMP_**_OQ predicate. 
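// --- Illustrative sketch (not part of the patch) ---------------------------
// The comparisons above use the "ordered, quiet" (_CMP_*_OQ) and
// "unordered, quiet" (_CMP_NEQ_UQ) predicates: an ordered compare yields
// false whenever either operand is NaN, an unordered one yields true, and the
// quiet variants never raise floating-point exceptions. A minimal standalone
// demonstration, assuming an AVX2-capable build (e.g. -mavx2):
#include <immintrin.h>
#include <cmath>
#include <cstdio>

int main() {
  const __m256d a = _mm256_setr_pd(1.0, std::nan(""), 3.0, 4.0);
  const __m256d b = _mm256_setr_pd(1.0, std::nan(""), 3.0, 5.0);
  // _CMP_EQ_OQ: NaN lanes compare false (bit cleared).
  const int eq = _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_EQ_OQ));
  // _CMP_NEQ_UQ: NaN lanes compare true (bit set).
  const int ne = _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_NEQ_UQ));
  std::printf("eq mask = 0x%x, ne mask = 0x%x\n", eq, ne);  // 0x5 and 0xa
  return 0;
}
// ----------------------------------------------------------------------------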
@@ -318,22 +411,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mul_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_div_pd(a, b); } @@ -345,7 +462,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized max = _mm256_max_pd(a, b); Vectorized isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q); // Exploit the fact that all-ones is a NaN. @@ -355,7 +478,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized< // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized min = _mm256_min_pd(a, b); Vectorized isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q); // Exploit the fact that all-ones is a NaN. 
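// --- Illustrative sketch (not part of the patch) ---------------------------
// maximum()/minimum() above implement the IEEE 754-201x operations that
// propagate NaN: _mm256_max_pd/_mm256_min_pd alone would drop NaNs, so the
// unordered-compare mask (all-ones wherever either input is NaN) is OR-ed
// into the result, and an all-ones double bit pattern is itself a NaN.
// A minimal standalone version of the same trick, assuming AVX2; the function
// name below is illustrative only.
#include <immintrin.h>
#include <cmath>
#include <cstdio>

static __m256d nan_propagating_max(__m256d a, __m256d b) {
  const __m256d max = _mm256_max_pd(a, b);
  const __m256d isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q);  // all-ones if a or b is NaN
  return _mm256_or_pd(max, isnan);  // force NaN lanes to an all-ones (NaN) pattern
}

int main() {
  const __m256d a = _mm256_setr_pd(1.0, std::nan(""), -2.0, 7.0);
  const __m256d b = _mm256_setr_pd(3.0, 0.0, -5.0, std::nan(""));
  double out[4];
  _mm256_storeu_pd(out, nan_propagating_max(a, b));
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 3 nan -2 nan
  return 0;
}
// ----------------------------------------------------------------------------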
@@ -363,31 +492,63 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized< } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_pd(max, _mm256_max_pd(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_pd(min, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_pd(max, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_or_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { return _mm256_xor_pd(a, b); } @@ -413,6 +574,41 @@ inline Vectorized Vectorized::lt(const Vectorized& other } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_pd(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0); } @@ -422,7 +618,12 @@ inline void convert(const double* src, double* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_pd(dst 
+ i, _mm256_loadu_pd(src + i)); } #ifndef __msvc_cl__ @@ -435,16 +636,35 @@ inline void convert(const double* src, double* dst, int64_t n) { #ifdef CPU_CAPABILITY_AVX2 template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_fmadd_pd(a, b, c); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_fmsub_pd(a, b, c); } #endif #endif +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index d57c28cfdbdc..4bb854314825 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -17,10 +17,22 @@ inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD template <> class Vectorized { private: __m256 values; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + __m256 values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = float; using size_type = int; static constexpr size_type size() { @@ -31,16 +43,41 @@ template <> class Vectorized { Vectorized(float val) { values = _mm256_set1_ps(val); } +<<<<<<< HEAD Vectorized(float val1, float val2, float val3, float val4, float val5, float val6, float val7, float val8) { values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); } Vectorized(const float (&arr)[8]) : Vectorized(arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]) {} +======= + Vectorized( + float val1, + float val2, + float val3, + float val4, + float val5, + float val6, + float val7, + float val8) { + values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); + } + Vectorized(const float (&arr)[8]) + : Vectorized( + arr[0], + arr[1], + arr[2], + arr[3], + arr[4], + arr[5], + arr[6], + arr[7]) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) operator __m256() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm256_blend_ps(a.values, b.values, mask); } @@ -56,6 +93,37 @@ template <> class Vectorized { } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm256_blend_ps(a.values, b.values, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_ps(a.values, b.values, mask.values); + } + template + static Vectorized arange( + float base = 0.f, 
+ step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -80,9 +148,16 @@ template <> class Vectorized { if (count == size()) return _mm256_loadu_ps(reinterpret_cast(ptr)); __at_align__ float tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } @@ -99,10 +174,18 @@ template <> class Vectorized { std::memcpy(ptr, tmp_values, count * sizeof(float)); } } +<<<<<<< HEAD const float& operator[](int idx) const = delete; float& operator[](int idx) = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ); return _mm256_movemask_ps(cmp); } @@ -111,8 +194,14 @@ template <> class Vectorized { } bool has_inf_nan() const { +<<<<<<< HEAD __m256 self_sub = _mm256_sub_ps(values, values); return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != 0; +======= + __m256 self_sub = _mm256_sub_ps(values, values); + return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != + 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(float (*const f)(float)) const { @@ -166,10 +255,17 @@ template <> class Vectorized { Vectorized atanh() const { return Vectorized(Sleef_atanhf8_u10(values)); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { return Vectorized(Sleef_atan2f8_u10(values, b)); } Vectorized copysign(const Vectorized &sign) const { +======= + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2f8_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_copysignf8(values, sign)); } Vectorized erf() const { @@ -237,9 +333,18 @@ template <> class Vectorized { const __m256 vec_one = _mm256_set1_ps(1.f); const __m256 
vec_zero = _mm256_set1_ps(0.f); const __m256 vec_two = _mm256_set1_ps(2.f); +<<<<<<< HEAD const __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) const __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); const __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); +======= + const __m256 vec_ln2f = + _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) + const __m256 vec_ln_flt_min = + _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); + const __m256 vec_ln_flt_max = + _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m256i vec_127 = _mm256_set1_epi32(0x0000007f); const int n_mantissa_bits = 23; @@ -315,7 +420,11 @@ template <> class Vectorized { Vectorized floor() const { return _mm256_floor_ps(values); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_hypotf8_u05(values, b)); } Vectorized i0() const { @@ -327,7 +436,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -337,7 +450,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -350,11 +467,20 @@ template <> class Vectorized { Vectorized neg() const { return _mm256_xor_ps(_mm256_set1_ps(-0.f), values); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { return Vectorized(Sleef_nextafterf8(values, b)); } Vectorized round() const { return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterf8(values, b)); + } + Vectorized round() const { + return _mm256_round_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized tan() const { return Vectorized(Sleef_tanf8_u10(values)); @@ -377,7 +503,11 @@ template <> class Vectorized { Vectorized rsqrt() const { return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values)); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_powf8_u10(values, b)); } float reduce_add() const { @@ -442,22 +572,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + 
const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mul_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_div_ps(a, b); } @@ -469,7 +623,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized max = _mm256_max_ps(a, b); Vectorized isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); // Exploit the fact that all-ones is a NaN. @@ -479,7 +639,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized min = _mm256_min_ps(a, b); Vectorized isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q); // Exploit the fact that all-ones is a NaN. 
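// --- Illustrative sketch (not part of the patch) ---------------------------
// The float exp() implementation a little further up in this specialization
// builds its constants from raw IEEE-754 bit patterns (integer constants
// reinterpreted as floats via _mm256_set1_epi32 + _mm256_castsi256_ps):
// 0x3f317218 is ln(2), 0xc2aeac50 is ln(FLT_MIN) and 0x42b17218 is
// ln(FLT_MAX). A small standalone check of those encodings, assuming C++20
// std::bit_cast is available:
#include <bit>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float ln2        = std::bit_cast<float>(std::uint32_t{0x3f317218});
  const float ln_flt_min = std::bit_cast<float>(std::uint32_t{0xc2aeac50});
  const float ln_flt_max = std::bit_cast<float>(std::uint32_t{0x42b17218});
  std::printf("%.7g vs %.7g\n", ln2, std::log(2.0f));           // ~0.6931472
  std::printf("%.7g vs %.7g\n", ln_flt_min, std::log(FLT_MIN)); // ~-87.33655
  std::printf("%.7g vs %.7g\n", ln_flt_max, std::log(FLT_MAX)); // ~88.72284
  return 0;
}
// ----------------------------------------------------------------------------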
@@ -487,31 +653,63 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_ps(max, _mm256_max_ps(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_ps(max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_ps(min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_and_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_or_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { return _mm256_xor_ps(a, b); } @@ -537,6 +735,41 @@ inline Vectorized Vectorized::lt(const Vectorized& other) c } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_ps(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -546,7 +779,12 @@ inline void convert(const float* src, float* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_ps(dst + i, 
_mm256_loadu_ps(src + i)); } #ifndef __msvc_cl__ @@ -557,20 +795,39 @@ inline void convert(const float* src, float* dst, int64_t n) { } } +<<<<<<< HEAD template <> Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_fmadd_ps(a, b, c); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_fmsub_ps(a, b, c); } // TODO: rewrite with ATEN vectorized (need to add unpack and shuffle) // Used by Inductor CPP codegen for micro gemm +<<<<<<< HEAD inline void transpose_block(at::vec::VectorizedN &input) { +======= +inline void transpose_block(at::vec::VectorizedN& input) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 temp0[8]; // unpacking and interleaving 32-bit elements // a0 b0 a1 b1 a4 b4 a5 b5 @@ -600,6 +857,7 @@ inline void transpose_block(at::vec::VectorizedN &input) { // e1 f1 g1 h1 ... // e2 f2 g2 h2 ... // e3 f3 g3 h3 ... +<<<<<<< HEAD temp1[0] = _mm256_castpd_ps( _mm256_unpacklo_pd(_mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2]))); temp1[1] = _mm256_castpd_ps( @@ -616,6 +874,24 @@ inline void transpose_block(at::vec::VectorizedN &input) { _mm256_unpacklo_pd(_mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); temp1[7] = _mm256_castpd_ps( _mm256_unpackhi_pd(_mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); +======= + temp1[0] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2]))); + temp1[1] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[0]), _mm256_castps_pd(temp0[2]))); + temp1[2] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[1]), _mm256_castps_pd(temp0[3]))); + temp1[3] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[1]), _mm256_castps_pd(temp0[3]))); + temp1[4] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[4]), _mm256_castps_pd(temp0[6]))); + temp1[5] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[4]), _mm256_castps_pd(temp0[6]))); + temp1[6] = _mm256_castpd_ps(_mm256_unpacklo_pd( + _mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); + temp1[7] = _mm256_castpd_ps(_mm256_unpackhi_pd( + _mm256_castps_pd(temp0[5]), _mm256_castps_pd(temp0[7]))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // shuffle 128-bits (composed of 4 32-bit elements) // a0 b0 c0 d0 e0 f0 g0 h0 @@ -637,7 +913,11 @@ inline void transpose_block(at::vec::VectorizedN &input) { } // Used by Inductor CPP codegen +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void transpose_mxn( const float* src, int64_t ld_src, @@ -672,12 +952,17 @@ inline void transpose_mxn( } } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void transpose_mxn( const float* src, int64_t ld_src, float* dst, int64_t ld_dst) { +<<<<<<< HEAD transpose_mxn( src , ld_src, dst, ld_dst); transpose_mxn( @@ -690,3 +975,15 @@ inline void transpose_mxn( #endif }} // namespace at::vec::CPU_CAPABILITY +======= + transpose_mxn(src, ld_src, dst, ld_dst); + transpose_mxn(src + 8, ld_src, dst + 8 * ld_dst, ld_dst); + transpose_mxn(src + 8 * ld_src, ld_src, dst + 8, ld_dst); + transpose_mxn( + src + 8 * ld_src + 8, ld_src, dst + 8 * ld_dst + 8, ld_dst); +} +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_half.h b/aten/src/ATen/cpu/vec/vec256/vec256_half.h index b27f33c84323..7d08c61e10c4 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_half.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_half.h @@ -13,8 +13,16 @@ inline namespace CPU_CAPABILITY { #ifdef CPU_CAPABILITY_AVX2 template <> +<<<<<<< HEAD class Vectorized: public Vectorized16 { public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized16::Vectorized16; using value_type = Half; @@ -29,6 +37,7 @@ class Vectorized: public Vectorized16 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_add_ps(x, y); }); } @@ -67,6 +76,74 @@ inline Vectorized Vectorized::lt(const Vectorized& other) cons return (*this < other) & Vectorized(1.0f); } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { + return _mm256_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm256_and_si256(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm256_or_si256(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm256_xor_si256(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} +inline 
Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -78,7 +155,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvtfp16_fp32(__m256i(a), a_lo, a_hi); @@ -96,7 +179,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; cvtfp16_fp32(__m256i(a), a_lo, a_hi); @@ -112,8 +201,15 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 min_lo, min_hi; __m256 max_lo, max_hi; @@ -126,7 +222,13 @@ Vectorized inline clamp(const Vectorized& a, } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 max_lo, max_hi; cvtfp16_fp32(__m256i(a), a_lo, a_hi); @@ -137,7 +239,13 @@ Vectorized inline clamp_max(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 min_lo, min_hi; cvtfp16_fp32(__m256i(a), a_lo, a_hi); @@ -153,8 +261,15 @@ inline void convert(const Half* src, Half* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc); } #ifndef __msvc_cl__ @@ -168,7 +283,12 @@ inline void convert(const Half* src, Half* dst, int64_t n) { template 
<> inline void convert(const float* src, Half* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a = _mm256_loadu_ps(&src[i]); __m256 b = _mm256_loadu_ps(&src[i + 8]); @@ -182,7 +302,11 @@ inline void convert(const float* src, Half* dst, int64_t n) { template <> inline void convert(const double* src, Half* dst, int64_t n) { +<<<<<<< HEAD auto load_float = [](const double *src) -> __m256 { +======= + auto load_float = [](const double* src) -> __m256 { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Load one float vector from an array of doubles __m128 a = _mm256_cvtpd_ps(_mm256_loadu_pd(src)); __m128 b = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4)); @@ -190,7 +314,12 @@ inline void convert(const double* src, Half* dst, int64_t n) { }; int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a = load_float(&src[i]); __m256 b = load_float(&src[i + 8]); @@ -203,8 +332,15 @@ inline void convert(const double* src, Half* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256 a_lo, a_hi; __m256 b_lo, b_hi; __m256 c_lo, c_hi; @@ -221,10 +357,21 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16) #else // defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD #if !(defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256)) +======= +#if !( \ + defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE256)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONVERT_NON_VECTORIZED_INIT(Half, half) #endif LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16) #endif // defined(CPU_CAPABILITY_AVX2) +<<<<<<< HEAD }} // namsepace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 03929eecfed3..0d80d92f081b 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -14,14 +14,23 @@ inline namespace CPU_CAPABILITY { #ifdef CPU_CAPABILITY_AVX2 struct Vectorizedi { +<<<<<<< HEAD protected: +======= + protected: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i values; static inline __m256i invert(const __m256i& v) { const auto ones = _mm256_set1_epi64x(-1); return _mm256_xor_si256(ones, v); } +<<<<<<< HEAD public: +======= + + public: +>>>>>>> 5729657180 ([ROCm] Specialized 
binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedi() {} Vectorizedi(__m256i v) : values(v) {} operator __m256i() const { @@ -31,17 +40,32 @@ struct Vectorizedi { #else +<<<<<<< HEAD struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined +======= +struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // CPU_CAPABILITY_AVX2 #ifdef CPU_CAPABILITY_AVX2 template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int64_t; using size_type = int; static constexpr size_type size() { @@ -49,12 +73,24 @@ class Vectorized : public Vectorizedi { } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int64_t v) { values = _mm256_set1_epi64x(v); } +======= + Vectorized(int64_t v) { + values = _mm256_set1_epi64x(v); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4) { values = _mm256_setr_epi64x(val1, val2, val3, val4); } template +<<<<<<< HEAD static Vectorized blend(Vectorized a, Vectorized b) { +======= + static Vectorized blend( + Vectorized a, + Vectorized b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ int64_t tmp_values[size()]; a.store(tmp_values); if (mask & 0x01) @@ -67,6 +103,7 @@ class Vectorized : public Vectorizedi { tmp_values[3] = _mm256_extract_epi64(b.values, 3); return loadu(tmp_values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { return _mm256_blendv_epi8(a.values, b.values, mask.values); @@ -77,6 +114,25 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int64_t count = size()) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int64_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -94,9 +150,16 @@ class Vectorized : public Vectorizedi { } static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ int64_t tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
+======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } @@ -114,8 +177,13 @@ class Vectorized : public Vectorizedi { std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); } } +<<<<<<< HEAD const int64_t& operator[](int idx) const = delete; int64_t& operator[](int idx) = delete; +======= + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { auto zero = _mm256_set1_epi64x(0); auto is_larger = _mm256_cmpgt_epi64(zero, values); @@ -160,16 +228,28 @@ class Vectorized : public Vectorizedi { }; template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int32_t; static constexpr int size() { return 8; } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int32_t v) { values = _mm256_set1_epi32(v); } Vectorized(int32_t val1, int32_t val2, int32_t val3, int32_t val4, int32_t val5, int32_t val6, int32_t val7, int32_t val8) { @@ -191,6 +271,52 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int32_t count = size()) { +======= + Vectorized(int32_t v) { + values = _mm256_set1_epi32(v); + } + Vectorized( + int32_t val1, + int32_t val2, + int32_t val3, + int32_t val4, + int32_t val5, + int32_t val6, + int32_t val7, + int32_t val8) { + values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm256_blend_epi32(a, b, mask); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int32_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int32_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -216,9 +342,16 @@ class Vectorized : public Vectorizedi { } static Vectorized loadu(const void* ptr, int32_t count) { __at_align__ int32_t tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
+======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } @@ -236,8 +369,13 @@ class Vectorized : public Vectorizedi { std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); } } +<<<<<<< HEAD const int32_t& operator[](int idx) const = delete; int32_t& operator[](int idx) = delete; +======= + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm256_abs_epi32(values); } @@ -306,6 +444,7 @@ class Vectorized : public Vectorizedi { }; template <> +<<<<<<< HEAD inline void convert(const int32_t *src, float *dst, int64_t n) { int64_t i; // int32_t and float have same size @@ -314,11 +453,27 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { #endif for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto input_vec = _mm256_loadu_si256(reinterpret_cast(src + i)); +======= +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; + // int32_t and float have same size +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_vec = + _mm256_loadu_si256(reinterpret_cast(src + i)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto output_vec = _mm256_cvtepi32_ps(input_vec); _mm256_storeu_ps(reinterpret_cast(dst + i), output_vec); } #ifndef _MSC_VER +<<<<<<< HEAD # pragma unroll +======= +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif for (; i < n; i++) { dst[i] = static_cast(src[i]); @@ -326,6 +481,7 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const int32_t *src, double *dst, int64_t n) { int64_t i; // int32_t has half the size of double @@ -334,11 +490,27 @@ inline void convert(const int32_t *src, double *dst, int64_t n) { #endif for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto input_128_vec = _mm_loadu_si128(reinterpret_cast(src + i)); +======= +inline void convert(const int32_t* src, double* dst, int64_t n) { + int64_t i; + // int32_t has half the size of double +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_128_vec = + _mm_loadu_si128(reinterpret_cast(src + i)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto output_vec = _mm256_cvtepi32_pd(input_128_vec); _mm256_storeu_pd(reinterpret_cast(dst + i), output_vec); } #ifndef _MSC_VER +<<<<<<< HEAD # pragma unroll +======= +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif for (; i < n; i++) { dst[i] = static_cast(src[i]); @@ -346,16 +518,28 @@ inline void 
convert(const int32_t *src, double *dst, int64_t n) { } template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int16_t; static constexpr int size() { return 16; } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int16_t v) { values = _mm256_set1_epi16(v); } Vectorized(int16_t val1, int16_t val2, int16_t val3, int16_t val4, int16_t val5, int16_t val6, int16_t val7, int16_t val8, @@ -366,6 +550,50 @@ class Vectorized : public Vectorizedi { } template static Vectorized blend(Vectorized a, Vectorized b) { +======= + Vectorized(int16_t v) { + values = _mm256_set1_epi16(v); + } + Vectorized( + int16_t val1, + int16_t val2, + int16_t val3, + int16_t val4, + int16_t val5, + int16_t val6, + int16_t val7, + int16_t val8, + int16_t val9, + int16_t val10, + int16_t val11, + int16_t val12, + int16_t val13, + int16_t val14, + int16_t val15, + int16_t val16) { + values = _mm256_setr_epi16( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ int16_t tmp_values[size()]; a.store(tmp_values); if (mask & 0x01) @@ -402,6 +630,7 @@ class Vectorized : public Vectorizedi { tmp_values[15] = _mm256_extract_epi16(b.values, 15); return loadu(tmp_values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { return _mm256_blendv_epi8(a.values, b.values, mask.values); @@ -416,6 +645,40 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int16_t count = size()) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + template + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int16_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -457,9 +720,16 @@ class Vectorized : public Vectorizedi { } static Vectorized loadu(const void* ptr, int16_t count) { __at_align__ int16_t tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
+======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } @@ -477,8 +747,13 @@ class Vectorized : public Vectorizedi { std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); } } +<<<<<<< HEAD const int16_t& operator[](int idx) const = delete; int16_t& operator[](int idx) = delete; +======= + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm256_abs_epi16(values); } @@ -522,17 +797,28 @@ class Vectorized : public Vectorizedi { template class Vectorized8 : public Vectorizedi { static_assert( +<<<<<<< HEAD std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); protected: static const Vectorized ones; public: +======= + std::is_same_v || std::is_same_v, + "Only int8_t/uint8_t are supported"); + + protected: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = T; static constexpr int size() { return 32; } using Vectorizedi::Vectorizedi; Vectorized8() {} +<<<<<<< HEAD Vectorized8(T v) { values = _mm256_set1_epi8(v); } Vectorized8(T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8, @@ -546,6 +832,77 @@ class Vectorized8 : public Vectorizedi { val9, val10, val11, val12, val13, val14, val15, val16, val17, val18, val19, val20, val21, val22, val23, val24, val25, val26, val27, val28, val29, val30, val31, val32); +======= + Vectorized8(T v) { + values = _mm256_set1_epi8(v); + } + Vectorized8( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32) { + values = _mm256_setr_epi8( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16, + val17, + val18, + val19, + val20, + val21, + val22, + val23, + val24, + val25, + val26, + val27, + val28, + val29, + val30, + val31, + val32); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template static Vectorized blend(Vectorized a, Vectorized b) { @@ -617,6 +974,7 @@ class Vectorized8 : public Vectorizedi { tmp_values[31] = _mm256_extract_epi8(b.values, 31); return loadu(tmp_values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { return _mm256_blendv_epi8(a.values, b.values, mask.values); @@ -635,6 +993,53 @@ class Vectorized8 : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, T count = size()) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + return _mm256_blendv_epi8(a.values, 
b.values, mask.values); + } + template + static Vectorized arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set(Vectorized a, Vectorized b, T count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -707,6 +1112,7 @@ class Vectorized8 : public Vectorizedi { return _mm256_loadu_si256(reinterpret_cast(ptr)); } static Vectorized loadu_one_fourth(const void* ptr) { +<<<<<<< HEAD // Fast path if only load element number of 8. // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), // Because loadu(const void* ptr, T count) requires zero initialization for upper 128 bits. @@ -721,6 +1127,24 @@ class Vectorized8 : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Fast path if only load element number of 8. + // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), + // Because loadu(const void* ptr, T count) requires zero initialization for + // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128 + // bits of the result are undefined. + // TODO We can use _mm256_zextsi128_si256 in the furture, + // since gcc 9.3 doesn't support it now. + __m128i input_128 = _mm_loadl_epi64(reinterpret_cast(ptr)); + return _mm256_castsi128_si256(input_128); + } + static Vectorized loadu(const void* ptr, T count) { + __at_align__ T tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
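The loadu(ptr, count) overloads in this hunk all follow the partial-load pattern the comment above describes: zero a temporary buffer with a loop, memcpy only the valid elements, then issue a full 256-bit load. A minimal standalone sketch of that pattern, assuming AVX2; the helper name is hypothetical and not part of the ATen API:

#include <immintrin.h>
#include <cstdint>
#include <cstring>

// Hypothetical helper illustrating the partial-load pattern above; not ATen code.
inline __m256i loadu_partial_epi8(const void* ptr, int count) {
  alignas(32) int8_t tmp[32];
  // Zero with a loop rather than "= {0}" so the compiler can emit a single
  // vector store, as the comment in the surrounding code explains.
  for (int i = 0; i < 32; ++i) {
    tmp[i] = 0;
  }
  std::memcpy(tmp, ptr, count);  // copy only the `count` valid bytes
  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(tmp));
}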
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } @@ -735,7 +1159,12 @@ class Vectorized8 : public Vectorizedi { } else if (count > 0) { if (count == 8) { // Fast path if only store element number of 8 +<<<<<<< HEAD _mm_storel_epi64(reinterpret_cast<__m128i*>(ptr), _mm256_castsi256_si128(values)); +======= + _mm_storel_epi64( + reinterpret_cast<__m128i*>(ptr), _mm256_castsi256_si128(values)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { __at_align__ T tmp_values[size()]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); @@ -743,8 +1172,13 @@ class Vectorized8 : public Vectorizedi { } } } +<<<<<<< HEAD const T& operator[](int idx) const = delete; T& operator[](int idx) = delete; +======= + const T& operator[](int idx) const = delete; + T& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized real() const { return *this; } @@ -756,15 +1190,28 @@ class Vectorized8 : public Vectorizedi { } }; +<<<<<<< HEAD template<> class Vectorized: public Vectorized8 { public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized8::Vectorized8; Vectorized neg() const; Vectorized abs() const { +<<<<<<< HEAD return _mm256_abs_epi8(values); +======= + return _mm256_abs_epi8(values); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator==(const Vectorized& other) const { @@ -794,9 +1241,18 @@ class Vectorized: public Vectorized8 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD template<> class Vectorized: public Vectorized8 { public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized8::Vectorized8; Vectorized neg() const; @@ -835,52 +1291,112 @@ class Vectorized: public Vectorized8 { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return 
_mm256_add_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_add_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sub_epi8(a, b); } @@ -909,7 +1425,14 @@ inline Vectorized Vectorized::neg() const { // by extracting each element, performing the operation pointwise, // then combining the results into a vector. 
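The comment above describes the fallback used when AVX2 has no 64-bit lane-wise instruction: pull each 64-bit element out, apply the operation in scalar code, and rebuild the vector. A minimal standalone sketch of that idea follows; the helper name and the store/load round trip are assumptions for illustration, whereas the ATen code visible in this hunk extracts lanes with _mm256_extract_epi64.

#include <immintrin.h>
#include <cstdint>

// Illustrative only: scalar emulation of a missing 64-bit lane-wise operation.
template <typename Op>
inline __m256i emulate_epi64(__m256i a, __m256i b, Op op) {
  alignas(32) int64_t va[4], vb[4], vc[4];
  _mm256_store_si256(reinterpret_cast<__m256i*>(va), a);
  _mm256_store_si256(reinterpret_cast<__m256i*>(vb), b);
  for (int i = 0; i < 4; ++i) {
    vc[i] = op(va[i], vb[i]);  // apply the scalar operation per lane
  }
  return _mm256_load_si256(reinterpret_cast<const __m256i*>(vc));
}

// e.g. a 64-bit multiply, which AVX2 has no single instruction for:
//   __m256i prod = emulate_epi64(x, y, [](int64_t p, int64_t q) { return p * q; });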
template +<<<<<<< HEAD Vectorized inline emulate(const Vectorized& a, const Vectorized& b, const op_t& op) { +======= +Vectorized inline emulate( + const Vectorized& a, + const Vectorized& b, + const op_t& op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t a0 = _mm256_extract_epi64(a, 0); int64_t a1 = _mm256_extract_epi64(a, 1); int64_t a2 = _mm256_extract_epi64(a, 2); @@ -929,7 +1452,15 @@ Vectorized inline emulate(const Vectorized& a, const Vectorize } template +<<<<<<< HEAD Vectorized inline emulate(const Vectorized& a, const Vectorized& b, const Vectorized& c, const op_t& op) { +======= +Vectorized inline emulate( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c, + const op_t& op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t a0 = _mm256_extract_epi64(a, 0); int64_t a1 = _mm256_extract_epi64(a, 1); int64_t a2 = _mm256_extract_epi64(a, 2); @@ -959,22 +1490,51 @@ Vectorized inline emulate(const Vectorized& a, const Vectorize // code for add as well. // Note: intentionally ignores undefined behavior like (-lowest * -1). template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { return emulate(a, b, [](int64_t a_point, int64_t b_point) __ubsan_ignore_undefined__ {return a_point * b_point;}); } template <> Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return emulate( + a, b, [](int64_t a_point, int64_t b_point) __ubsan_ignore_undefined__ { + return a_point * b_point; + }); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mullo_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_mullo_epi16(a, b); } template +<<<<<<< HEAD Vectorized inline int_elementwise_binary_256(const Vectorized& a, const Vectorized& b, Op op) { +======= +Vectorized inline int_elementwise_binary_256( + const Vectorized& a, + const Vectorized& b, + Op op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T values_a[Vectorized::size()]; T values_b[Vectorized::size()]; a.store(values_a); @@ -986,7 +1546,13 @@ Vectorized inline int_elementwise_binary_256(const Vectorized& a, const Ve } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We don't have an instruction for multiplying int8_t #ifndef CPU_CAPABILITY_AVX2 return int_elementwise_binary_256(a, b, std::multiplies()); @@ -1004,14 +1570,25 @@ Vectorized inline operator*(const Vectorized& a, const Vectorize } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= 
+Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We don't have an instruction for multiplying uint8_t #ifndef CPU_CAPABILITY_AVX2 return int_elementwise_binary_256(a, b, std::multiplies()); #else __m256i mask00FF = _mm256_set1_epi16(0x00FF); +<<<<<<< HEAD __m256i a_lo = _mm256_and_si256 (a, mask00FF); __m256i b_lo = _mm256_and_si256 (b, mask00FF); +======= + __m256i a_lo = _mm256_and_si256(a, mask00FF); + __m256i b_lo = _mm256_and_si256(b, mask00FF); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i a_hi = _mm256_srli_epi16(a, 8); __m256i b_hi = _mm256_srli_epi16(b, 8); __m256i res_lo = _mm256_and_si256(_mm256_mullo_epi16(a_lo, b_lo), mask00FF); @@ -1022,9 +1599,19 @@ Vectorized inline operator*(const Vectorized& a, const Vectori } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::min(a_point, b_point);}); +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, b, [](int64_t a_point, int64_t b_point) { + return std::min(a_point, b_point); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else __m256i cmp = _mm256_cmpgt_epi64(a, b); return _mm256_blendv_epi8(a, b, cmp); @@ -1032,29 +1619,63 @@ Vectorized inline minimum(const Vectorized& a, const Vectorize } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epu8(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::max(a_point, b_point);}); +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, b, [](int64_t a_point, int64_t b_point) { + return std::max(a_point, b_point); + }); +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else __m256i cmp = _mm256_cmpgt_epi64(a, b); return _mm256_blendv_epi8(b, a, cmp); @@ -1062,108 +1683,238 @@ Vectorized inline maximum(const Vectorized& a, const Vectorize } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epu8(a, b); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, min_val, max_val, [](int64_t a_point, int64_t min_point, int64_t max_point) {return std::min(max_point, std::max(a_point, min_point));}); +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate( + a, + min_val, + max_val, + [](int64_t a_point, int64_t min_point, int64_t max_point) { + return std::min(max_point, std::max(a_point, min_point)); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else return minimum(maximum(a, min_val), max_val); #endif } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi32(max_val, _mm256_max_epi32(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi16(max_val, _mm256_max_epi16(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + 
const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi8(max_val, _mm256_max_epi8(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epu8(max_val, _mm256_max_epu8(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, max_val, [](int64_t a_point, int64_t max_point) {return std::min(max_point, a_point);}); +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, max_val, [](int64_t a_point, int64_t max_point) { + return std::min(max_point, a_point); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else return minimum(max_val, a); #endif } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi32(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi16(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epi8(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_min_epu8(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { #ifndef CPU_CAPABILITY_AVX2 return emulate(a, min_val, [](int64_t a_point, int64_t min_point) {return std::max(min_point, a_point);}); +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +#ifndef CPU_CAPABILITY_AVX2 + return emulate(a, min_val, [](int64_t a_point, int64_t min_point) { + return std::max(min_point, a_point); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else return maximum(min_val, a); #endif } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const 
Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi32(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi16(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_max_epi8(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { return _mm256_max_epu8(min_val, a); } @@ -1179,17 +1930,52 @@ std::enable_if_t, Vectorized> inline convert_to_int32(const int8_t* ptr, int count=Vectorized::size()) { if (count == Vectorized::size()) { return _mm256_cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(ptr))); +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm256_max_epu8(min_val, a); +} + +template +std::enable_if_t< + !(std::is_same_v || std::is_same_v), + Vectorized< + int32_t>> inline convert_to_int32(const T* ptr, int count = Vectorized::size()) { + return Vectorized::loadu(ptr, count); +} + +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const int8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm256_cvtepi8_epi32( + _mm_loadl_epi64(reinterpret_cast(ptr))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { auto a = Vectorized::loadu(ptr, count); return _mm256_cvtepi8_epi32(_mm256_castsi256_si128(a)); } } +<<<<<<< HEAD template std::enable_if_t, Vectorized> inline convert_to_int32(const uint8_t* ptr, int count=Vectorized::size()) { if (count == Vectorized::size()) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(ptr))); +======= +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const uint8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast(ptr))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { auto a = Vectorized::loadu(ptr, count); return _mm256_cvtepu8_epi32(_mm256_castsi256_si128(a)); @@ -1197,6 +1983,7 @@ inline convert_to_int32(const uint8_t* ptr, int count=Vectorized::size( } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { return int_elementwise_binary_256(a, b, std::divides()); } @@ -1230,10 +2017,72 @@ inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { return _mm256_xor_si256(a, b); } template>::value, int> = 0> +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); 
+} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_256(a, b, std::divides()); +} + +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return _mm256_and_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return _mm256_or_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + return _mm256_xor_si256(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized operator~(const Vectorized& a) { return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); } +<<<<<<< HEAD inline Vectorized Vectorized::eq(const Vectorized& other) const { return (*this == other) & Vectorized(1); } @@ -1351,11 +2200,166 @@ inline Vectorized Vectorized::lt(const Vectorized& ot } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline 
Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1); } template +<<<<<<< HEAD Vectorized inline shift_256_16(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline shift_256_16( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // No vector instruction for shifting int16_t, so emulating it instead. // Control masks for shuffle operation, treating 256 bits as an @@ -1364,6 +2368,7 @@ Vectorized inline shift_256_16(const Vectorized& a, const Vect // M!=N) is set so that shuffle will move element with index M from // input pair into element with index N in output pair, and element // with index M in output pair will be set to all 0s. 
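The eq/ne/gt/ge/lt/le helpers above all follow the same pattern: a vector comparison yields lanes that are either all ones or all zeros, and masking with Vectorized(1) collapses that into a 0/1 boolean lane. A minimal scalar sketch of the idea (assumption: plain C++ stand-in, not the ATen Vectorized API):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Model of `(*this == other) & Vectorized(1)`: the compare produces an all-ones
// mask per equal lane, and AND-ing with 1 turns it into a 0/1 value.
std::array<int32_t, 8> eq_as_bool(const std::array<int32_t, 8>& a,
                                  const std::array<int32_t, 8>& b) {
  std::array<int32_t, 8> out{};
  for (std::size_t i = 0; i < out.size(); ++i) {
    const int32_t mask = (a[i] == b[i]) ? -1 : 0;  // -1 == all bits set
    out[i] = mask & 1;
  }
  return out;
}
```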
+<<<<<<< HEAD __m256i ctl_0_1 = _mm256_set_epi8(29, 28, 0x80, 0x80, 25, 24, 0x80, 0x80, 21, 20, 0x80, 0x80, 17, 16, 0x80, 0x80, 13, 12, 0x80, 0x80, 9, 8, 0x80, 0x80, @@ -1372,6 +2377,74 @@ Vectorized inline shift_256_16(const Vectorized& a, const Vect 0x80, 0x80, 23, 22, 0x80, 0x80, 19, 18, 0x80, 0x80, 15, 14, 0x80, 0x80, 11, 10, 0x80, 0x80, 7, 6, 0x80, 0x80, 3, 2); +======= + __m256i ctl_0_1 = _mm256_set_epi8( + 29, + 28, + 0x80, + 0x80, + 25, + 24, + 0x80, + 0x80, + 21, + 20, + 0x80, + 0x80, + 17, + 16, + 0x80, + 0x80, + 13, + 12, + 0x80, + 0x80, + 9, + 8, + 0x80, + 0x80, + 5, + 4, + 0x80, + 0x80, + 1, + 0, + 0x80, + 0x80); + __m256i ctl_1_0 = _mm256_set_epi8( + 0x80, + 0x80, + 31, + 30, + 0x80, + 0x80, + 27, + 26, + 0x80, + 0x80, + 23, + 22, + 0x80, + 0x80, + 19, + 18, + 0x80, + 0x80, + 15, + 14, + 0x80, + 0x80, + 11, + 10, + 0x80, + 0x80, + 7, + 6, + 0x80, + 0x80, + 3, + 2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Masks for bitwise and operation, treating 256 bits as an array of // 16-bit elements, and considering them in pairs of neighboring @@ -1423,8 +2496,20 @@ Vectorized inline shift_256_16(const Vectorized& a, const Vect return c; } +<<<<<<< HEAD template || std::is_same_v, int> = 0> Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) { +======= +template < + bool left_shift, + typename T, + typename std::enable_if_t< + std::is_same_v || std::is_same_v, + int> = 0> +Vectorized inline shift_256_8( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // No vector instruction for shifting int8_t/uint8_t, so emulating // it instead. @@ -1435,6 +2520,7 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) // with index M from input quadruple into element with index N in // output quadruple, and other elements in output quadruple will be // set to all 0s. 
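shift_256_16 works around the lack of a per-lane 16-bit variable shift in AVX2 by parking each 16-bit element in a 32-bit slot (via the shuffle control masks above), doing a 32-bit variable shift, and moving the result back so spilled bits cannot leak into a neighbouring lane. A minimal scalar sketch of the widen-shift-narrow idea for a single left-shifted lane (assumption: conceptual model, not the actual shuffle-based kernel):

```cpp
#include <cstdint>

// Put the 16-bit lane in the upper half of a 32-bit word so that bits shifted out
// of the lane fall off the top of the 32-bit word instead of into a neighbour.
uint16_t shl16_via_32(uint16_t a, unsigned shift) {
  uint32_t widened = static_cast<uint32_t>(a) << 16;  // lane occupies bits 16..31
  widened <<= (shift & 15);                           // 32-bit shift, overflow bits vanish
  return static_cast<uint16_t>(widened >> 16);        // bring the lane back down
}
```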
+<<<<<<< HEAD __m256i ctl_0_3 = _mm256_set_epi8(28, 0x80, 0x80, 0x80, 24, 0x80, 0x80, 0x80, 20, 0x80, 0x80, 0x80, 16, 0x80, 0x80, 0x80, 12, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80, @@ -1467,6 +2553,272 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) 0x80, 23, 0x80, 0x80, 0x80, 19, 0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11, 0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3, 0x80, 0x80); +======= + __m256i ctl_0_3 = _mm256_set_epi8( + 28, + 0x80, + 0x80, + 0x80, + 24, + 0x80, + 0x80, + 0x80, + 20, + 0x80, + 0x80, + 0x80, + 16, + 0x80, + 0x80, + 0x80, + 12, + 0x80, + 0x80, + 0x80, + 8, + 0x80, + 0x80, + 0x80, + 4, + 0x80, + 0x80, + 0x80, + 0, + 0x80, + 0x80, + 0x80); + __m256i ctl_1_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 29, + 0x80, + 0x80, + 0x80, + 25, + 0x80, + 0x80, + 0x80, + 21, + 0x80, + 0x80, + 0x80, + 17, + 0x80, + 0x80, + 0x80, + 13, + 0x80, + 0x80, + 0x80, + 9, + 0x80, + 0x80, + 0x80, + 5, + 0x80, + 0x80, + 0x80, + 1); + __m256i ctl_1_3 = _mm256_set_epi8( + 29, + 0x80, + 0x80, + 0x80, + 25, + 0x80, + 0x80, + 0x80, + 21, + 0x80, + 0x80, + 0x80, + 17, + 0x80, + 0x80, + 0x80, + 13, + 0x80, + 0x80, + 0x80, + 9, + 0x80, + 0x80, + 0x80, + 5, + 0x80, + 0x80, + 0x80, + 1, + 0x80, + 0x80, + 0x80); + __m256i ctl_2_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 30, + 0x80, + 0x80, + 0x80, + 26, + 0x80, + 0x80, + 0x80, + 22, + 0x80, + 0x80, + 0x80, + 18, + 0x80, + 0x80, + 0x80, + 14, + 0x80, + 0x80, + 0x80, + 10, + 0x80, + 0x80, + 0x80, + 6, + 0x80, + 0x80, + 0x80, + 2); + __m256i ctl_2_3 = _mm256_set_epi8( + 30, + 0x80, + 0x80, + 0x80, + 26, + 0x80, + 0x80, + 0x80, + 22, + 0x80, + 0x80, + 0x80, + 18, + 0x80, + 0x80, + 0x80, + 14, + 0x80, + 0x80, + 0x80, + 10, + 0x80, + 0x80, + 0x80, + 6, + 0x80, + 0x80, + 0x80, + 2, + 0x80, + 0x80, + 0x80); + __m256i ctl_3_0 = _mm256_set_epi8( + 0x80, + 0x80, + 0x80, + 31, + 0x80, + 0x80, + 0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3); + __m256i ctl_3_1 = _mm256_set_epi8( + 0x80, + 0x80, + 31, + 0x80, + 0x80, + 0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3, + 0x80); + __m256i ctl_3_2 = _mm256_set_epi8( + 0x80, + 31, + 0x80, + 0x80, + 0x80, + 27, + 0x80, + 0x80, + 0x80, + 23, + 0x80, + 0x80, + 0x80, + 19, + 0x80, + 0x80, + 0x80, + 15, + 0x80, + 0x80, + 0x80, + 11, + 0x80, + 0x80, + 0x80, + 7, + 0x80, + 0x80, + 0x80, + 3, + 0x80, + 0x80); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Masks for bitwise and operation, treating 256 bits as an array of // 8-bit elements, and considering them in quadruples of neighboring @@ -1497,11 +2849,18 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) __m256i c0; if (left_shift) c0 = _mm256_sllv_epi32(a0, b0); +<<<<<<< HEAD else if constexpr (std::is_same_v) c0 = _mm256_srav_epi32(a0, b0); else c0 = _mm256_srlv_epi32(a0, b0); +======= + else if constexpr (std::is_same_v) + c0 = _mm256_srav_epi32(a0, b0); + else + c0 = _mm256_srlv_epi32(a0, b0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c0 = _mm256_shuffle_epi8(c0, ctl_3_0); // Peform shifting the same way for input array elements with @@ -1511,11 +2870,18 @@ Vectorized inline 
shift_256_8(const Vectorized& a, const Vectorized& b) __m256i c1; if (left_shift) c1 = _mm256_sllv_epi32(a1, b1); +<<<<<<< HEAD else if constexpr (std::is_same_v) c1 = _mm256_srav_epi32(a1, b1); else c1 = _mm256_srlv_epi32(a1, b1); +======= + else if constexpr (std::is_same_v) + c1 = _mm256_srav_epi32(a1, b1); + else + c1 = _mm256_srlv_epi32(a1, b1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c1 = _mm256_shuffle_epi8(c1, ctl_3_1); // Peform shifting the same way for input array elements with @@ -1525,25 +2891,43 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) __m256i c2; if (left_shift) c2 = _mm256_sllv_epi32(a2, b2); +<<<<<<< HEAD else if constexpr (std::is_same_v) c2 = _mm256_srav_epi32(a2, b2); else c2 = _mm256_srlv_epi32(a2, b2); +======= + else if constexpr (std::is_same_v) + c2 = _mm256_srav_epi32(a2, b2); + else + c2 = _mm256_srlv_epi32(a2, b2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c2 = _mm256_shuffle_epi8(c2, ctl_3_2); // Peform shifting the same way for input array elements with // idx%4==3. +<<<<<<< HEAD __m256i a3 = _mm256_and_si256(a, keep_3); +======= + __m256i a3 = _mm256_and_si256(a, keep_3); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); __m256i c3; if (left_shift) c3 = _mm256_sllv_epi32(a3, b3); +<<<<<<< HEAD else if constexpr (std::is_same_v) c3 = _mm256_srav_epi32(a3, b3); else c3 = _mm256_srlv_epi32(a3, b3); +======= + else if constexpr (std::is_same_v) + c3 = _mm256_srav_epi32(a3, b3); + else + c3 = _mm256_srlv_epi32(a3, b3); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c3 = _mm256_and_si256(c3, keep_3); // Merge partial results into the final result. 
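The repeated `if constexpr (std::is_same_v...)` branches above pick `_mm256_srav_epi32` (arithmetic, sign-filling) for the signed int8_t case and `_mm256_srlv_epi32` (logical, zero-filling) otherwise. A minimal scalar sketch of the same compile-time dispatch (assumption: standalone illustration, not the kernel itself):

```cpp
#include <cstdint>
#include <type_traits>

// Signed element type -> arithmetic right shift (sign bit replicated);
// unsigned element type -> logical right shift (zero fill).
template <typename T,
          typename std::enable_if_t<
              std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>, int> = 0>
T shift_right(T a, unsigned s) {
  if constexpr (std::is_same_v<T, int8_t>) {
    return static_cast<T>(a >> s);                        // arithmetic
  } else {
    return static_cast<T>(static_cast<uint8_t>(a) >> s);  // logical
  }
}
```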
@@ -1555,31 +2939,62 @@ Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sllv_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_sllv_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { // No vector instruction for right arithmetic shifting int64_t, so emulating it // instead. @@ -1589,6 +3004,20 @@ Vectorized inline operator>>(const Vectorized& a, const Vector __m256i zero = _mm256_set1_epi64x(0); __m256i max_shift = _mm256_set1_epi64x(64); __m256i mask = _mm256_or_si256(_mm256_cmpgt_epi64(zero, b), _mm256_cmpgt_epi64(b, max_shift)); +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + // No vector instruction for right arithmetic shifting int64_t, so emulating + // it instead. + + // Clamp the shift values such that shift values < 0 and > 64 are changed to + // 64 which results in -1 for negative input and 0 for non-negative input. + __m256i zero = _mm256_set1_epi64x(0); + __m256i max_shift = _mm256_set1_epi64x(64); + __m256i mask = _mm256_or_si256( + _mm256_cmpgt_epi64(zero, b), _mm256_cmpgt_epi64(b, max_shift)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m256i shift = _mm256_blendv_epi8(b, max_shift, mask); // Shift the number logically to the right, thus filling the most // significant bits with 0s. 
Then, replace these bits with the sign @@ -1603,25 +3032,54 @@ Vectorized inline operator>>(const Vectorized& a, const Vector } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_srav_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_256_8(a, b); } #endif +<<<<<<< HEAD }} // namespace at::vec::CPU_CAPABILITY +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index 9b900cd0f63e..99ceb6eba8d4 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -35,8 +35,13 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
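For 64-bit lanes, the operator>> above has no arithmetic-shift instruction to lean on, so it clamps out-of-range shift counts to 64, shifts logically, and then re-inserts the sign bits; that is why negative inputs end up as -1 and non-negative inputs as 0 for oversized shifts. A minimal scalar sketch of that sequence for one lane (assumption: models the idea, not the AVX2 blend/mask code):

```cpp
#include <cstdint>

int64_t sra64_emulated(int64_t a, int64_t b) {
  // Clamp shift counts < 0 or > 64 to 64, matching the comment in the kernel above.
  const uint64_t shift = (b < 0 || b > 64) ? 64u : static_cast<uint64_t>(b);
  const uint64_t sign = (a < 0) ? ~uint64_t{0} : 0u;  // all ones for negative input
  const uint64_t logical =
      (shift == 64) ? 0u : (static_cast<uint64_t>(a) >> shift);
  const uint64_t sign_fill = (shift == 0) ? 0u : (sign << (64 - shift));
  return static_cast<int64_t>(logical | sign_fill);
}
```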
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { inline namespace CPU_CAPABILITY { @@ -103,10 +108,19 @@ inline __m256i pack_saturate_and_clamp( } template +<<<<<<< HEAD typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> inline convert_int8_to_float(at::vec::Vectorized src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() // Only handle first 8*8 bits +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(at::vec::Vectorized src) { + // Note: this function only convert inputs number of elements equal to + // at::vec::Vectorized.size() Only handle first 8*8 bits +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i input_128 = _mm256_castsi256_si128(src); // Convert from 8*uint8/int8 to 8*int32 __m256i input_256_int32; @@ -119,8 +133,15 @@ inline convert_int8_to_float(at::vec::Vectorized src) { } template +<<<<<<< HEAD typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> inline convert_float_to_int8(at::vec::Vectorized src) { +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(at::vec::Vectorized src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Convert from float32 to int32 with truncation __m256i x_values_int32 = _mm256_cvttps_epi32(src); @@ -131,10 +152,17 @@ inline convert_float_to_int8(at::vec::Vectorized src) { constexpr auto max_val = std::numeric_limits::max(); // Convert from int16 to uint8/int8 using unsigned saturation +<<<<<<< HEAD __m256i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); __m256i permute_mask_v = _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); +======= + __m256i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, xy_packed_v, min_val, max_val); + __m256i permute_mask_v = + _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v); } @@ -255,6 +283,7 @@ __FORCE_INLINE void QuantizeAvx2( } } +<<<<<<< HEAD template<> struct Vectorized : public Vectorizedqi { using size_type = int; @@ -384,6 +413,151 @@ struct Vectorized : public Vectorizedqi { template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + using size_type = int; + static constexpr size_type kSize = Vectorized::size(); + static constexpr size_type size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int int_num_vecs() { + return 1; + } + + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, 1>; + using value_type = c10::qint32::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint32& 
val) { + value_type uw = val.val_; + vals = _mm256_set1_epi32(uw); + } + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized /*zero_point*/, + Vectorized scale_zp_premul) const { + __m256 float_vals = _mm256_cvtepi32_ps(vals); + return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + __m256 float_vals = _mm256_cvtepi32_ps(vals); + return {(Vectorized(float_vals) - zero_point) * scale}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float /*inverse_scale*/) { + Vectorized retval; + auto rhs_data = (__m256)rhs[0]; + at::native::quantize_vec( + scale, + zero_point, + (float*)&rhs_data, + (c10::qint32*)&retval.vals, + size()); + return retval; + } + + Vectorized maximum(Vectorized b) const { + return _mm256_max_epi32(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epi32(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epi32( + _mm256_max_epi32(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + return {_mm256_sub_epi32(vals, b)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + + __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier_v); + __m256i rounded = _mm256_cvtps_epi32(scaled); + return _mm256_add_epi32(rounded, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -447,6 +621,7 @@ __m256i RequantizeAvx2( return xyzw_clamped_v; } +<<<<<<< HEAD template<> struct Vectorized : public Vectorizedqi { static constexpr int kSize = VECTOR_WIDTH; @@ -519,6 +694,88 @@ struct Vectorized : public Vectorizedqi { __m256i cvtepi8_epi32(__m128i epi8_vals) const { return _mm256_cvtepi8_epi32(epi8_vals); } +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int kSize = 
VECTOR_WIDTH; + static constexpr int size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int kIntNumVecs = kSize / Vectorized::size(); + static constexpr int int_num_vecs() { + return kIntNumVecs; + } + + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, kIntNumVecs>; + using value_type = typename c10::qint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + + Vectorized() {} + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint8& val) { + value_type uw = val.val_; + vals = _mm256_set1_epi8(uw); + } + + // This is needed because the compiler emits awful code for the default + // constructor for moving the enum + // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy) + C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wdeprecated-copy") + C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy") +#endif + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + C10_CLANG_DIAGNOSTIC_POP() + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
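The loadu(ptr, count) overloads above all zero a full-width temporary with a loop, memcpy only `count` valid elements into it, and then load the whole buffer, so lanes past `count` are well defined (see the issue linked in the comment). A minimal standalone sketch of that pattern for 8-bit lanes (assumption: a plain AVX2 helper, not the Vectorized member itself):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <immintrin.h>

__m256i loadu_partial_epi8(const void* ptr, int count) {
  alignas(32) int8_t tmp[32];
  // Zero with a loop rather than "={0}", mirroring the rationale in the comment above.
  for (int i = 0; i < 32; ++i) {
    tmp[i] = 0;
  }
  std::memcpy(tmp, ptr, static_cast<std::size_t>(count) * sizeof(int8_t));
  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(tmp));
}
```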
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + private: + __m256i cvtepi8_epi32(__m128i epi8_vals) const { + return _mm256_cvtepi8_epi32(epi8_vals); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: float_vec_return_type dequantize( @@ -579,6 +836,7 @@ struct Vectorized : public Vectorizedqi { } Vectorized maximum(Vectorized b) const { +<<<<<<< HEAD return _mm256_max_epi8(vals, b.vals); } @@ -720,6 +978,159 @@ struct Vectorized : public Vectorizedqi { __m256i cvtepu8_epi32(__m128i epu8_vals) const { return _mm256_cvtepu8_epi32(epu8_vals); } +======= + return _mm256_max_epi8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epi8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epi8(_mm256_max_epi8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256i int32_val0 = cvtepi8_epi32(int_val0); + __m256i int32_val1 = cvtepi8_epi32(int_val1); + __m256i int32_val2 = cvtepi8_epi32(int_val2); + __m256i int32_val3 = cvtepi8_epi32(int_val3); + + __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0)); + __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1)); + __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2)); + __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3)); + + __m256i int32_b0 = cvtepi8_epi32(int_b0); + __m256i int32_b1 = cvtepi8_epi32(int_b1); + __m256i int32_b2 = cvtepi8_epi32(int_b2); + __m256i int32_b3 = cvtepi8_epi32(int_b3); + + __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0); + __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1); + __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2); + __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3); + + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + return RequantizeAvx2(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int kSize = VECTOR_WIDTH; + static constexpr int size() { + return kSize; + } + + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); + static constexpr int float_num_vecs() { + return kFloatNumVecs; + } + + static constexpr int kIntNumVecs = kSize / Vectorized::size(); + static constexpr int int_num_vecs() { + return kIntNumVecs; + } + + using 
float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, kIntNumVecs>; + using value_type = typename c10::quint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m256i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::quint8& val) { + value_type uw = val.val_; + vals = _mm256_set1_epi8(uw); + } + + // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy) + C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wdeprecated-copy") + C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy") +#endif + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + C10_CLANG_DIAGNOSTIC_POP() + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm256_storeu_si256((__m256i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return _mm256_loadu_si256((const __m256i*)tmp_values); + } + + private: + __m256i cvtepu8_epi32(__m128i epu8_vals) const { + return _mm256_cvtepu8_epi32(epu8_vals); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: float_vec_return_type dequantize( @@ -780,6 +1191,7 @@ struct Vectorized : public Vectorizedqi { } Vectorized maximum(Vectorized b) const { +<<<<<<< HEAD return _mm256_max_epu8(vals, b.vals); } @@ -848,6 +1260,77 @@ struct Vectorized : public Vectorizedqi { template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= + return _mm256_max_epu8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm256_min_epu8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm256_min_epu8(_mm256_max_epu8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); + __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); + __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2)); + __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3)); + + __m256i int32_val0 = cvtepu8_epi32(int_val0); + __m256i int32_val1 = cvtepu8_epi32(int_val1); + __m256i int32_val2 = cvtepu8_epi32(int_val2); + __m256i int32_val3 = cvtepu8_epi32(int_val3); + + __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0)); + __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1)); + __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2)); + __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3)); + + __m256i int32_b0 = cvtepu8_epi32(int_b0); + __m256i int32_b1 = cvtepu8_epi32(int_b1); + __m256i int32_b2 = cvtepu8_epi32(int_b2); + __m256i int32_b3 = cvtepu8_epi32(int_b3); + + __m256i res_0 = 
_mm256_sub_epi32(int32_val0, int32_b0); + __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1); + __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2); + __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3); + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m256 multiplier_v = _mm256_set1_ps(multiplier); + __m256i zero_point_v = _mm256_set1_epi32(zero_point); + return RequantizeAvx2(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm256_loadu_si256((const __m256i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -908,7 +1391,13 @@ struct VectorizedQuantizedConverter { float tmp_vals[Vectorized::size()]; for (const auto j : c10::irange(Vectorized::size())) { tmp_vals[j] = at::native::dequantize_val( +<<<<<<< HEAD scale[j], zero_point[j], T(vals[Vectorized::size() * i + j])); +======= + scale[j], + zero_point[j], + T(vals[Vectorized::size() * i + j])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } rv[i] = Vectorized(tmp_vals); } @@ -928,10 +1417,17 @@ struct VectorizedQuantizedConverter { template <> struct Vectorized : public VectorizedQuantizedConverter< +<<<<<<< HEAD c10::qint32, std::array, 1>, std::array, 1>, Vectorized::size()> { +======= + c10::qint32, + std::array, 1>, + std::array, 1>, + Vectorized::size()> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { @@ -940,14 +1436,27 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
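requantize_from_int above maps the widened int32 differences back to the quantized domain: multiply by a float multiplier, round to nearest, and add the output zero point (the AVX2 path then packs and saturates via RequantizeAvx2). A minimal scalar sketch of the per-lane arithmetic (assumption: illustrative only; saturation to the 8-bit range is left out):

```cpp
#include <cmath>
#include <cstdint>

int32_t requantize_one(int32_t widened, float multiplier, int32_t zero_point) {
  const float scaled = static_cast<float>(widened) * multiplier;  // _mm256_mul_ps step
  // _mm256_cvtps_epi32 rounds to nearest (even) under the default MXCSR mode;
  // std::nearbyint likewise follows the current rounding mode.
  const int32_t rounded = static_cast<int32_t>(std::nearbyint(scaled));
  return rounded + zero_point;                                    // _mm256_add_epi32 step
}
```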
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy( +<<<<<<< HEAD tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(tmp_values); } @@ -989,11 +1498,18 @@ struct Vectorized : public VectorizedQuantizedConverter< return retval; } +<<<<<<< HEAD Vectorized relu(Vectorized zero_point) const { return maximum(zero_point); } +======= + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -1028,7 +1544,13 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -1055,11 +1577,22 @@ Vectorized inline operator+( } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint8, std::array, 4>, std::array, 4>, 4 * Vectorized::size()> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 4 * Vectorized::size()> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { @@ -1068,14 +1601,27 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
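The fallback VectorizedQuantizedConverter path above dequantizes one element at a time with at::native::dequantize_val, i.e. the usual affine mapping real = (q - zero_point) * scale. A minimal scalar sketch of that mapping and its inverse (assumption: the rounding and clamping details of the real quantize routine are simplified here):

```cpp
#include <cmath>
#include <cstdint>

float dequantize_one(int32_t q, float scale, int32_t zero_point) {
  return static_cast<float>(q - zero_point) * scale;
}

int32_t quantize_one(float x, float scale, int32_t zero_point) {
  // Inverse mapping; clamping to the target dtype's range is omitted in this sketch.
  return static_cast<int32_t>(std::nearbyint(x / scale)) + zero_point;
}
```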
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy( +<<<<<<< HEAD tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(tmp_values); } @@ -1166,16 +1712,33 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::quint8, std::array, 4>, std::array, 4>, 4 * Vectorized::size()> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 4 * Vectorized::size()> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { @@ -1184,14 +1747,27 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
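relu and relu6 in the quantized structs above are clamps in the quantized domain: relu is a max against the quantized zero point, and relu6 additionally takes a min against the quantized representation of six (q_six). A minimal scalar sketch (assumption: standalone illustration):

```cpp
#include <algorithm>
#include <cstdint>

uint8_t qrelu(uint8_t v, uint8_t zero_point) {
  return std::max(v, zero_point);                   // maximum(zero_point)
}

uint8_t qrelu6(uint8_t v, uint8_t zero_point, uint8_t q_six) {
  return std::min(std::max(v, zero_point), q_six);  // min(max(v, zero_point), q_six)
}
```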
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy( +<<<<<<< HEAD tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(tmp_values); } @@ -1237,7 +1813,10 @@ struct Vectorized : public VectorizedQuantizedConverter< return maximum(zero_point); } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -1283,13 +1862,20 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } #endif // if defined(CPU_CAPABILITY_AVX2) #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) +<<<<<<< HEAD std::pair, Vectorized> inline convert_int8_to_float(at::vec::Vectorized src) { auto s8x8 = vld1_s8(src.operator const int8_t*()); @@ -1332,3 +1918,52 @@ inline convert_int8_half_register_to_float(at::vec::Vectorized src) { #endif }} // namespace at::vec::CPU_CAPABILITY +======= +std::pair, Vectorized> inline convert_int8_to_float( + at::vec::Vectorized src) { + auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s16x8 = vmovl_s8(s8x8); + + auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8)); + auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); + + return std::make_pair( + Vectorized(vcvtq_f32_s32(s32x4_lo)), + Vectorized(vcvtq_f32_s32(s32x4_hi))); +} + +std::pair, Vectorized> inline convert_int8_to_float( + at::vec::Vectorized src) { + auto u8x8 = vld1_u8(src.operator const uint8_t*()); + auto u16x8 = vmovl_u8(u8x8); + auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8)); + auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); + + return std::make_pair( + Vectorized(vcvtq_f32_u32(u32x4_lo)), + Vectorized(vcvtq_f32_u32(u32x4_hi))); +} + +Vectorized inline convert_int8_half_register_to_float( + at::vec::Vectorized src) { + auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s16x8 = vmovl_s8(s8x8); + + auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); + + return Vectorized(vcvtq_f32_s32(s32x4_lo)); +} + +Vectorized inline convert_int8_half_register_to_float( + at::vec::Vectorized src) { + auto u8x8 = vld1_u8(src.operator const uint8_t*()); + auto u16x8 = vmovl_u8(u8x8); + auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); + + return Vectorized(vcvtq_f32_u32(u32x4_lo)); +} + +#endif +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h index 2d8afd9ef295..97d4d1f83ca6 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h @@ -34,7 +34,13 @@ inline Vectorized convert_float_bfloat16( return Vectorized::loadu(arr2); } +<<<<<<< HEAD inline 
void load_fp32_from_bf16(const c10::BFloat16* data, Vectorized& out) { +======= +inline void load_fp32_from_bf16( + const c10::BFloat16* data, + Vectorized& out) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float values[Vectorized::size()]; for (const auto k : c10::irange(Vectorized::size())) { values[k] = data[k]; @@ -68,6 +74,10 @@ inline void load_fp32_from_fp16( load_fp32_from_fp16(data, out2); } +<<<<<<< HEAD } // namespace +======= +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h index 98ac83963179..04bed3a64b66 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Note: header order is important here #include @@ -14,8 +19,13 @@ #include #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include @@ -82,8 +92,12 @@ convert_to_int_of_same_size(const Vectorized& src) { template <> Vectorized C10_ALWAYS_INLINE +<<<<<<< HEAD convert_to_int_of_same_size( const Vectorized& src) { +======= +convert_to_int_of_same_size(const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized{vec_signed(src.vec0()), vec_signed(src.vec1())}; } @@ -91,10 +105,19 @@ template <> inline void convert(const int32_t* src, float* dst, int64_t n) { // int32_t and float have same size int64_t i; +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { const int32_t* src_a = src + i; float* dst_a = dst + i; vint32 input_vec0 = vec_vsx_ld(offset0, reinterpret_cast(src_a)); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + const int32_t* src_a = src + i; + float* dst_a = dst + i; + vint32 input_vec0 = + vec_vsx_ld(offset0, reinterpret_cast(src_a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vint32 input_vec1 = vec_vsx_ld(offset16, reinterpret_cast(src_a)); vfloat32 c0 = vec_float(input_vec0); @@ -111,7 +134,12 @@ inline void convert(const int32_t* src, float* dst, int64_t n) { template <> inline void convert(const int64_t* src, double* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int64_t* src_a = src + i; double* dst_a = dst + i; vint64 input_vec0 = @@ -127,8 +155,13 @@ inline void convert(const int64_t* src, double* dst, int64_t n) { dst[i] = static_cast(src[i]); } } +<<<<<<< HEAD //Generic implementation to fix compiler error //TO-DO : Add optimized version for ppc64 +======= +// Generic 
implementation to fix compiler error +// TO-DO : Add optimized version for ppc64 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline std::tuple, Vectorized> convert_half_float( const Vectorized& a) { constexpr int64_t K = Vectorized::size(); @@ -137,12 +170,22 @@ inline std::tuple, Vectorized> convert_half_float( a.store(arr2); convert(arr2, arr, K); return std::make_tuple( +<<<<<<< HEAD Vectorized::loadu(arr), Vectorized::loadu(arr + Vectorized::size())); } inline Vectorized convert_float_half( const Vectorized& a, const Vectorized& b) { +======= + Vectorized::loadu(arr), + Vectorized::loadu(arr + Vectorized::size())); +} + +inline Vectorized convert_float_half( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int64_t K = Vectorized::size(); __at_align__ float arr[K]; __at_align__ Half arr2[K]; @@ -241,6 +284,10 @@ std::pair, Vectorized> inline deinterleave2( Vectorized{aa0123, aa0123_2}, Vectorized{bb0123, bb0123_2}); } +<<<<<<< HEAD } // namespace +======= +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h index 2c74847758d8..5a90ffe2f411 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h @@ -1,7 +1,12 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -12,6 +17,12 @@ inline namespace CPU_CAPABILITY { using ComplexDbl = c10::complex; template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { union { struct { @@ -36,8 +47,15 @@ class Vectorized { Vectorized() {} C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(ComplexDbl val) { double real_value = val.real(); @@ -58,30 +76,62 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const 
Vectorized& b) { +======= + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {b._vec0, a._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std:: + enable_if_t> + C10_ALWAYS_INLINE blend( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {a._vec0, b._vec1}; } @@ -100,8 +150,13 @@ class Vectorized { const Vectorized& b, const Vectorized& mask) { // convert std::complex index mask to V index mask: xy -> xxyy +<<<<<<< HEAD auto mask_complex = Vectorized(vec_splat(mask._vec0, 0), vec_splat(mask._vec1, 0)); +======= + auto mask_complex = Vectorized( + vec_splat(mask._vec0, 0), vec_splat(mask._vec1, 0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_sel(a._vec0, b._vec0, mask_complex._vecb0), vec_sel(a._vec1, b._vec1, mask_complex._vecb1)}; @@ -210,16 +265,26 @@ class Vectorized { } static Vectorized el_mergee( +<<<<<<< HEAD Vectorized& first, Vectorized& second) { +======= + const Vectorized& first, + const Vectorized& second) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_mergeh(first._vec0, second._vec0), vec_mergeh(first._vec1, second._vec1)}; } static Vectorized el_mergeo( +<<<<<<< HEAD Vectorized& first, Vectorized& second) { +======= + const Vectorized& first, + const Vectorized& second) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_mergel(first._vec0, second._vec0), vec_mergel(first._vec1, second._vec1)}; @@ -235,7 +300,13 @@ class Vectorized { Vectorized abs_() const { auto vi = el_mergeo(); auto vr = el_mergee(); +<<<<<<< HEAD return {Sleef_hypotd2_u05vsx(vr._vec0, vi._vec0), Sleef_hypotd2_u05vsx(vr._vec1, vi._vec1)}; +======= + return { + Sleef_hypotd2_u05vsx(vr._vec0, vi._vec0), + Sleef_hypotd2_u05vsx(vr._vec1, vi._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized abs() const { @@ -394,8 +465,13 @@ class Vectorized { Vectorized& second) { // Operates on individual floats, see _mm_hadd_ps // {f0+f1, s0+s1, f2+f3, s2+s3, ...} +<<<<<<< HEAD // i.e. it sums the re and im of each value and interleaves first and second: // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} +======= + // i.e. 
it sums the re and im of each value and interleaves first and + // second: {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return el_mergee(first, second) + el_mergeo(first, second); } @@ -413,7 +489,12 @@ class Vectorized { return el_mergee(first_ret, second_ret); // 2 mergee's } +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& b) const { +======= + Vectorized inline operator*( + const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i #if 1 // this is more vsx friendly than simulating horizontal from x86 @@ -422,7 +503,11 @@ class Vectorized { vi = vi ^ vd_rsign_mask; auto ret = elwise_mult(vr); auto vx_swapped = el_swapped(); +<<<<<<< HEAD ret = vx_swapped.el_madd(vi, ret); +======= + ret = vx_swapped.elwise_mult(vi) + ret; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else auto ac_bd = elwise_mult(b); auto d_c = b.el_swapped(); @@ -433,6 +518,7 @@ class Vectorized { return ret; } +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& b) const { // re + im*i = (a + bi) / (c + di) // re = (ac + bd)/abs_2() @@ -452,6 +538,41 @@ class Vectorized { auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 ret = ret.elwise_div(denom2); return ret; +======= + Vectorized inline operator/( + const Vectorized& b) const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() + // im = (bc - ad)/abs_2() + // auto fabs_cd = Vectorized{ + // vec_andc(b._vec0, vd_sign_mask), + // vec_andc(b._vec1, vd_sign_mask)}; // |c| |d| + // auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + // auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + // auto a2 = elwise_div(scale); // a/sc b/sc + // auto b2 = b.elwise_div(scale); // c/sc d/sc + // auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/sc^2 + // auto dc2 = b2.el_swapped(); // d/sc c/sc + // dc2 = dc2 ^ vd_rsign_mask; // -d/sc c/sc + // auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + // auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + // auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 + // (c^2+d^2)/sc^2 ret = ret.elwise_div(denom2); return ret; + + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + this->store(tmp1); + b.store(tmp2); + + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return loadu(out); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized exp() const { @@ -493,6 +614,7 @@ class Vectorized { } Vectorized eq(const Vectorized& other) const { +<<<<<<< HEAD auto eq = (*this == other); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal return (eq.real() & eq.imag()) & vd_one; @@ -500,6 +622,17 @@ class Vectorized { Vectorized ne(const Vectorized& other) const { auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal +======= + auto eq = (*this == other); // compares real and imag 
individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & vd_one; + } + Vectorized ne(const Vectorized& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (ne.real() | ne.imag()) & vd_one; } @@ -555,6 +688,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -580,5 +714,95 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + // (a + ib) * (c + id) = (ac - bd) + i(ad + bc) + // Split into real and imaginary parts + auto a_real = a.el_mergee(); // real part of a + auto a_imag = a.el_mergeo(); // imag part of a + auto b_real = b.el_mergee(); // real part of b + auto b_imag = b.el_mergeo(); // imag part of b + + // Compute components + auto ac = a_real.elwise_mult(b_real); // real*real + auto bd = a_imag.elwise_mult(b_imag); // imag*imag + + // Real part: ac - bd + auto real = ac - bd; + + auto ad = a_real.elwise_mult(b_imag); // real*imag + auto bc = a_imag.elwise_mult(b_real); // imag*real + + // Imag = ad + bc + auto imag = ad + bc; + + // Merge real and imaginary parts into vectors + __vector double v0 = vec_mergeh(real.vec0(), imag.vec0()); // [r0, i0] + __vector double v1 = vec_mergeh(real.vec1(), imag.vec1()); // [r1, i1] + + // Create the final result + auto result = Vectorized{v0, v1}; + return result; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() + // im = (bc - ad)/abs_2() + // Take absolute values of real and imaginary parts of b + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return Vectorized::loadu(out); +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // 
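// Illustrative aside (not part of the patch): a minimal scalar sketch of the lane
// arithmetic the complex<double> operator* overload above relies on. Interleaved
// {re, im} storage is split into real/imaginary lanes (the role of el_mergee /
// el_mergeo), combined as (ac - bd) and (ad + bc), and interleaved back (the role
// of vec_mergeh). The array names and sizes here are assumptions for the sketch
// only, not the header's API.
#include <array>
#include <cassert>
#include <cmath>
#include <complex>

int main() {
  // Two packed complex values per vector, stored as re, im pairs.
  std::array<double, 4> a = {1.0, 2.0, -3.0, 0.5};   // (1+2i), (-3+0.5i)
  std::array<double, 4> b = {0.25, -1.0, 2.0, 2.0};  // (0.25-1i), (2+2i)
  std::array<double, 4> out{};
  for (std::size_t i = 0; i < a.size(); i += 2) {
    double ar = a[i], ai = a[i + 1];  // "even" / "odd" lanes of a
    double br = b[i], bi = b[i + 1];  // "even" / "odd" lanes of b
    out[i]     = ar * br - ai * bi;   // real lane: ac - bd
    out[i + 1] = ar * bi + ai * br;   // imag lane: ad + bc
  }
  // Cross-check the decomposition against std::complex multiplication.
  for (std::size_t i = 0; i < a.size(); i += 2) {
    std::complex<double> ref =
        std::complex<double>(a[i], a[i + 1]) * std::complex<double>(b[i], b[i + 1]);
    assert(std::abs(out[i] - ref.real()) < 1e-12);
    assert(std::abs(out[i + 1] - ref.imag()) < 1e-12);
  }
  return 0;
}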
namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h index 58fdd34b18d8..65582fcc1a2e 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -13,6 +18,12 @@ inline namespace CPU_CAPABILITY { using ComplexFlt = c10::complex; template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { private: union { @@ -40,8 +51,15 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(ComplexFlt val) { float real_value = val.real(); @@ -50,7 +68,15 @@ class Vectorized { _vec1 = vfloat32{real_value, imag_value, real_value, imag_value}; } +<<<<<<< HEAD Vectorized(ComplexFlt val1, ComplexFlt val2, ComplexFlt val3, ComplexFlt val4) { +======= + Vectorized( + ComplexFlt val1, + ComplexFlt val2, + ComplexFlt val3, + ComplexFlt val4) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _vec0 = vfloat32{val1.real(), val1.imag(), val2.real(), val2.imag()}; _vec1 = vfloat32{val3.real(), val3.imag(), val4.real(), val4.imag()}; } @@ -153,8 +179,15 @@ class Vectorized { auto mask_complex = Vectorized( vec_mergeh(mask._vec0, mask._vec0), vec_mergeh(mask._vec1, mask._vec1)); return { +<<<<<<< HEAD vec_sel(a._vec0, b._vec0, reinterpret_cast(mask_complex._vec0)), vec_sel(a._vec1, b._vec1, reinterpret_cast(mask_complex._vec1)), +======= + vec_sel( + a._vec0, b._vec0, reinterpret_cast(mask_complex._vec0)), + vec_sel( + a._vec1, b._vec1, reinterpret_cast(mask_complex._vec1)), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } @@ -250,8 +283,13 @@ class Vectorized { Vectorized& second) { // Operates on individual floats, see _mm_hadd_ps // {f0+f1, s0+s1, f2+f3, s2+s3, ...} +<<<<<<< HEAD // i.e. it sums the re and im of each value and interleaves first and second: // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} +======= + // i.e. 
it sums the re and im of each value and interleaves first and + // second: {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return el_mergee(first, second) + el_mergeo(first, second); } @@ -279,7 +317,13 @@ class Vectorized { Vectorized abs_() const { auto vi = el_mergeo(); auto vr = el_mergee(); +<<<<<<< HEAD return {Sleef_hypotf4_u05vsx(vr._vec0, vi._vec0), Sleef_hypotf4_u05vsx(vr._vec1, vi._vec1)}; +======= + return { + Sleef_hypotf4_u05vsx(vr._vec0, vi._vec0), + Sleef_hypotf4_u05vsx(vr._vec1, vi._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized abs() const { @@ -354,16 +398,26 @@ class Vectorized { } static Vectorized el_mergee( +<<<<<<< HEAD Vectorized& first, Vectorized& second) { +======= + const Vectorized& first, + const Vectorized& second) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_mergee(first._vecb0, second._vecb0), vec_mergee(first._vecb1, second._vecb1)}; } static Vectorized el_mergeo( +<<<<<<< HEAD Vectorized& first, Vectorized& second) { +======= + const Vectorized& first, + const Vectorized& second) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_mergeo(first._vecb0, second._vecb0), vec_mergeo(first._vecb1, second._vecb1)}; @@ -469,7 +523,12 @@ class Vectorized { return Vectorized(pi_2) - asin(); } +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& b) const { +======= + Vectorized inline operator*( + const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i #if 1 @@ -480,7 +539,11 @@ class Vectorized { vi = vi ^ rsign_mask; auto ret = elwise_mult(vr); auto vx_swapped = el_swapped(); +<<<<<<< HEAD ret = vx_swapped.el_madd(vi, ret); +======= + ret = vx_swapped.elwise_mult(vi) + ret; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; #else @@ -494,6 +557,7 @@ class Vectorized { #endif } +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& b) const { // re + im*i = (a + bi) / (c + di) // re = (ac + bd)/abs_2() @@ -513,6 +577,40 @@ class Vectorized { auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 ret = ret.elwise_div(denom2); return ret; +======= + Vectorized inline operator/( + const Vectorized& b) const { +#if 1 + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; + this->store(tmp1); + b.store(tmp2); + + for (const auto i : c10::irange(Vectorized>::size())) { + out[i] = tmp1[i] / tmp2[i]; + } + return loadu(out); +#else + auto fabs_cd = Vectorized{ + vec_andc(b._vec0, sign_mask), vec_andc(b._vec1, sign_mask)}; // |c| |d| + auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + auto a2 = elwise_div(scale); // a/sc b/sc + auto b2 = b.elwise_div(scale); // c/sc d/sc + auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/s + auto dc2 = b2.el_swapped(); // d/sc c/sc + dc2 = dc2 ^ rsign_mask; // -d/sc c/sc + auto adbc2 = 
a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 + ret = ret.elwise_div(denom2); + return ret; +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized asin() const { @@ -549,6 +647,7 @@ class Vectorized { } Vectorized eq(const Vectorized& other) const { +<<<<<<< HEAD auto eq = (*this == other); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal return (eq.real() & eq.imag()) & one; @@ -556,6 +655,17 @@ class Vectorized { Vectorized ne(const Vectorized& other) const { auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal +======= + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & one; + } + Vectorized ne(const Vectorized& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (ne.real() | ne.imag()) & one; } @@ -631,6 +741,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -656,5 +767,114 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + // (a + ib) * (c + id) = (ac - bd) + i(ad + bc) + // Split into real and imaginary parts + auto a_real = a.el_mergee(); // real part of a + auto a_imag = a.el_mergeo(); // imag part of a + auto b_real = b.el_mergee(); // real part of b + auto b_imag = b.el_mergeo(); // imag part of b + + auto b_imag_neg = b_imag ^ rsign_mask; + // Compute components + auto ac = a_real.elwise_mult(b_real); // real * real + auto bd = a_imag.elwise_mult(b_imag_neg); // imag * imag + auto ad = a_real.elwise_mult(b_imag); // real * imag + auto bc = a_imag.elwise_mult(b_real); // imag * real + + // Real = ac - bd (fix the negative bd part) + auto real = ac + bd; // Real part 
calculation + auto imag = ad + bc; // Imaginary part calculation + + // Step 1: Extract from real and imag + __vector float r0 = real.vec0(); // {r0, r1, r2, r3} + __vector float i0 = imag.vec0(); // {i0, i1, i2, i3} + + __vector float r1 = real.vec1(); // imag[0..3] + __vector float i1 = imag.vec1(); // imag[4..7] + + __vector unsigned char perm_lo = { + 0, + 1, + 2, + 3, // r0 + 16, + 17, + 18, + 19, // + 8, + 9, + 10, + 11, // r1 + 24, + 25, + 26, + 27}; + __vector float v0 = + vec_perm(r0, i0, perm_lo); // Interleave r0 and i0, r1 and i1 + __vector float v1 = vec_perm(r1, i1, perm_lo); + Vectorized result(v0, v1); + return result; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + // Take absolute values of real and imaginary parts of b + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex out[Vectorized>::size()]; + a.store(tmp1); + b.store(tmp2); + for (const auto i : + c10::irange(Vectorized>:: + size())) { //{Vectorized>::size())) + //{ + out[i] = tmp1[i] / tmp2[i]; + } + return Vectorized::loadu(out); +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h index ff10618611f9..7fdeb12c2f19 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -12,6 +17,11 @@ namespace vec { inline namespace CPU_CAPABILITY { +<<<<<<< HEAD +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> class Vectorized { @@ -39,8 +49,15 @@ class Vectorized { Vectorized() {} C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(double scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( @@ -63,6 +80,7 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { return a; @@ -120,10 +138,72 @@ class Vectorized { // generated masks return { b._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) }; +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return a; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + 
return b; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + return {a._vec0, b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + return {(vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + return {(vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_2nd = VsxDblMask2(mask); + // generated masks + return {a._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_2nd = VsxDblMask2(mask); + // generated masks + return {b._vec0, (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template static std::enable_if_t> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { const vbool64 mask_1st = VsxDblMask1(mask); const vbool64 mask_2nd = VsxDblMask2(mask); @@ -133,6 +213,17 @@ class Vectorized { } +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + const vbool64 mask_2nd = VsxDblMask2(mask); + return { + (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static Vectorized C10_ALWAYS_INLINE blendv( const Vectorized& a, const Vectorized& b, @@ -144,12 +235,26 @@ class Vectorized { vec_sel(a._vec1, b._vec1, mask._vecb1)}; } template +<<<<<<< HEAD static Vectorized arange(double base = 0., step_t step = static_cast(1)) { return Vectorized(base, base + step, base + 2 * step, base + 3 * step); } static Vectorized C10_ALWAYS_INLINE set(const Vectorized& a, const Vectorized& b, size_t count = size()) { +======= + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); + } + + static Vectorized C10_ALWAYS_INLINE + set(const Vectorized& a, + const Vectorized& b, + size_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -192,15 +297,24 @@ class Vectorized { double& operator[](int idx) = delete; Vectorized map(double (*const f)(double)) const { Vectorized ret; +<<<<<<< HEAD for (const auto i : c10::irange(size()/2)) { ret._vec0[i] = f(_vec0[i]); } for (const auto i : c10::irange(size()/2)) { ret._vec1[i] = f(_vec1[i]); +======= + for (const auto i : c10::irange(size() / 2)) { + ret._vec0[i] = f(_vec0[i]); + } + for (const auto i : c10::irange(size() / 2)) { + ret._vec1[i] = f(_vec1[i]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
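// Illustrative sketch only: a scalar stand-in for the map()/mapbi() helpers whose
// reformatting appears in this hunk — a scalar callback applied to every lane of
// both underlying 128-bit halves. The struct and function names below are
// assumptions; the real class is Vectorized<double> with members _vec0/_vec1.
#include <array>
#include <cmath>
#include <cstddef>
#include <iostream>

static double my_sqrt(double x) { return std::sqrt(x); }  // hypothetical scalar callback

struct VecF64x4 {            // stand-in for Vectorized<double>
  std::array<double, 2> v0;  // plays the role of _vec0
  std::array<double, 2> v1;  // plays the role of _vec1

  VecF64x4 map(double (*f)(double)) const {
    VecF64x4 r{};
    for (std::size_t i = 0; i < v0.size(); ++i) r.v0[i] = f(v0[i]);  // first half
    for (std::size_t i = 0; i < v1.size(); ++i) r.v1[i] = f(v1[i]);  // second half
    return r;
  }
};

int main() {
  VecF64x4 x{{1.0, 4.0}, {9.0, 16.0}};
  VecF64x4 y = x.map(my_sqrt);
  std::cout << y.v0[0] << ' ' << y.v0[1] << ' '
            << y.v1[0] << ' ' << y.v1[1] << '\n';  // prints: 1 2 3 4
}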
} return ret; } +<<<<<<< HEAD Vectorized mapbi(double (*const f)(double, double), const Vectorized& other) const { Vectorized ret; @@ -209,6 +323,17 @@ class Vectorized { } for (const auto i : c10::irange(size()/2)) { ret._vec1[i] = f(_vec1[i], other._vec1[i]); +======= + Vectorized mapbi( + double (*const f)(double, double), + const Vectorized& other) const { + Vectorized ret; + for (const auto i : c10::irange(size() / 2)) { + ret._vec0[i] = f(_vec0[i], other._vec0[i]); + } + for (const auto i : c10::irange(size() / 2)) { + ret._vec1[i] = f(_vec1[i], other._vec1[i]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } return ret; } @@ -217,6 +342,7 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE acos() const { +<<<<<<< HEAD return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE acosh() const { @@ -248,11 +374,48 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE exp() const { return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)}; +======= + return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE acosh() const { + return {Sleef_acoshd2_u10(_vec0), Sleef_acoshd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE asin() const { + return {Sleef_asind2_u10(_vec0), Sleef_asind2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE asinh() const { + return {Sleef_asinhd2_u10(_vec0), Sleef_asinhd2_u10(_vec1)}; + } + Vectorized atan() const { + return {Sleef_atand2_u10(_vec0), Sleef_atand2_u10(_vec1)}; + } + Vectorized atanh() const { + return {Sleef_atanhd2_u10(_vec0), Sleef_atanhd2_u10(_vec1)}; + } + Vectorized atan2(const Vectorized& b) const { + return { + Sleef_atan2d2_u10(_vec0, b._vec0), Sleef_atan2d2_u10(_vec1, b._vec1)}; + } + Vectorized copysign(const Vectorized& sign) const { + return { + Sleef_copysignd2(_vec0, sign._vec0), + Sleef_copysignd2(_vec1, sign._vec1)}; + } + Vectorized erf() const { + return {Sleef_erfd2_u10(_vec0), Sleef_erfd2_u10(_vec1)}; + } + Vectorized erfc() const { + return {Sleef_erfcd2_u15(_vec0), Sleef_erfcd2_u15(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE exp() const { + return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE exp2() const { return {Sleef_exp2d2_u10(_vec0), Sleef_exp2d2_u10(_vec1)}; } Vectorized expm1() const { +<<<<<<< HEAD return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE exp_u20() const { @@ -261,6 +424,16 @@ class Vectorized { Vectorized lgamma() const __ubsan_ignore_undefined__ { return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)}; +======= + return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE exp_u20() const { + return exp(); + } + + Vectorized lgamma() const __ubsan_ignore_undefined__ { + return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized erfinv() const { @@ -269,7 +442,13 @@ class Vectorized { Vectorized angle() const { auto tmp = blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) return blendv(tmp, *this, isnan()); } Vectorized real() const { @@ -283,6 +462,7 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE log() const { +<<<<<<< HEAD return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE log10() const { @@ -293,15 +473,34 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE log2() const { return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)}; +======= + return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log10() const { + return {Sleef_log10d2_u10(_vec0), Sleef_log10d2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log1p() const { + return {Sleef_log1pd2_u10(_vec0), Sleef_log1pd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE log2() const { + return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE ceil() const { return {vec_ceil(_vec0), vec_ceil(_vec1)}; } Vectorized C10_ALWAYS_INLINE cos() const { +<<<<<<< HEAD return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE cosh() const { return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)}; +======= + return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE cosh() const { + return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE floor() const { return {vec_floor(_vec0), vec_floor(_vec1)}; @@ -313,6 +512,7 @@ class Vectorized { return {vec_rint(_vec0), vec_rint(_vec1)}; } Vectorized C10_ALWAYS_INLINE sin() const { +<<<<<<< HEAD return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE sinh() const { @@ -323,6 +523,18 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE tanh() const { return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)}; +======= + return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE sinh() const { + return {Sleef_sinhd2_u10(_vec0), Sleef_sinhd2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE tan() const { + return {Sleef_tand2_u10(_vec0), Sleef_tand2_u10(_vec1)}; + } + Vectorized C10_ALWAYS_INLINE tanh() const { + return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE trunc() const { return {vec_trunc(_vec0), vec_trunc(_vec1)}; @@ -345,6 +557,7 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE pow(const Vectorized& b) const { +<<<<<<< HEAD return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)}; } Vectorized C10_ALWAYS_INLINE fmod(const Vectorized& b) const { @@ -357,6 +570,22 @@ class Vectorized { Vectorized nextafter(const Vectorized& b) const { return {Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)}; +======= + return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)}; + } + Vectorized C10_ALWAYS_INLINE fmod(const Vectorized& b) const { + return {Sleef_fmodd2(_vec0, b._vec0), Sleef_fmodd2(_vec1, b._vec1)}; + } + + Vectorized hypot(const Vectorized& b) const { + return { + Sleef_hypotd2_u05(_vec0, b._vec0), Sleef_hypotd2_u05(_vec1, b._vec1)}; + } + + Vectorized nextafter(const Vectorized& b) const { + return { + 
Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized igamma(const Vectorized& x) const { @@ -367,7 +596,10 @@ class Vectorized { return mapbi(calc_igammac, x); } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized i0() const { return map(calc_i0); } @@ -390,6 +622,7 @@ class Vectorized { return ret._nor(); } bool has_inf_nan() const { +<<<<<<< HEAD for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { return true; @@ -397,6 +630,15 @@ class Vectorized { } for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { +======= + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } @@ -441,6 +683,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -476,5 +719,56 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, cons } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h index 246f0e8a7f1e..04556891c5d7 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include 
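// Illustrative sketch only: a portable scalar model of the has_inf_nan() check
// reformatted above — scan every lane of both halves and report whether any
// element is NaN or infinite. The helper name and the plain std:: functions are
// assumptions; the header iterates _vec0/_vec1 with _isnan/_isinf over
// c10::irange(size()/2).
#include <array>
#include <cmath>
#include <iostream>
#include <limits>

static bool has_inf_nan(const std::array<double, 2>& v0,
                        const std::array<double, 2>& v1) {
  for (double x : v0)
    if (std::isnan(x) || std::isinf(x)) return true;  // first half
  for (double x : v1)
    if (std::isnan(x) || std::isinf(x)) return true;  // second half
  return false;
}

int main() {
  std::array<double, 2> a = {1.0, 2.0};
  std::array<double, 2> b = {3.0, std::numeric_limits<double>::infinity()};
  std::cout << std::boolalpha
            << has_inf_nan(a, a) << ' '   // false
            << has_inf_nan(a, b) << '\n'; // true
}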
namespace at { namespace vec { @@ -11,6 +16,12 @@ namespace vec { inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { private: union { @@ -38,8 +49,15 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) + : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(float scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( @@ -61,62 +79,116 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {b._vec0, a._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {a._vec0, b._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_1st = VsxMask1(mask); return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_1st = VsxMask1(mask); return 
{(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_2nd = VsxMask2(mask); // generated masks return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_2nd = VsxMask2(mask); // generated masks return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; } template +<<<<<<< HEAD static std::enable_if_t> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool32 mask_1st = VsxMask1(mask); const vbool32 mask_2nd = VsxMask2(mask); return { @@ -136,7 +208,13 @@ class Vectorized { } template +<<<<<<< HEAD static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { +======= + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized( base, base + step, @@ -212,8 +290,14 @@ class Vectorized { return ret; } +<<<<<<< HEAD Vectorized mapbi(float (*const f)(float, float), const Vectorized& other) const { +======= + Vectorized mapbi( + float (*const f)(float, float), + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (int i = 0; i < size() / 2; i++) { ret._vec0[i] = f(_vec0[i], other._vec0[i]); @@ -235,6 +319,7 @@ class Vectorized { } bool has_inf_nan() const { +<<<<<<< HEAD for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { return true; @@ -242,6 +327,15 @@ class Vectorized { } for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { +======= + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } @@ -268,7 +362,11 @@ class Vectorized { return {Sleef_acosf4_u10(_vec0), Sleef_acosf4_u10(_vec1)}; } Vectorized C10_ALWAYS_INLINE acosh() const { +<<<<<<< HEAD return {Sleef_acoshf4_u10(_vec0), Sleef_acoshf4_u10(_vec1)}; +======= + return {Sleef_acoshf4_u10(_vec0), Sleef_acoshf4_u10(_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized C10_ALWAYS_INLINE asin() const { return {Sleef_asinf4_u10(_vec0), Sleef_asinf4_u10(_vec1)}; @@ -283,10 +381,20 @@ 
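// Illustrative sketch only: the blend<mask> specializations in this file pick, for
// each lane i, element i of b when bit i of the compile-time mask is set and
// element i of a otherwise; the VSX code expands those bits into all-ones lane
// masks (VsxMask1/VsxMask2) and hands them to vec_sel. blend_model below is a
// hypothetical scalar helper written for this sketch, not part of the header.
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

template <uint32_t Mask, typename T, std::size_t N>
std::array<T, N> blend_model(const std::array<T, N>& a, const std::array<T, N>& b) {
  std::array<T, N> out{};
  for (std::size_t i = 0; i < N; ++i) {
    out[i] = ((Mask >> i) & 1u) ? b[i] : a[i];  // mirrors vec_sel with a per-lane mask
  }
  return out;
}

int main() {
  std::array<int32_t, 8> a = {0, 1, 2, 3, 4, 5, 6, 7};
  std::array<int32_t, 8> b = {10, 11, 12, 13, 14, 15, 16, 17};
  auto r = blend_model<0b00001111>(a, b);  // low 4 lanes from b, high 4 from a
  for (auto v : r) std::cout << v << ' ';
  std::cout << '\n';  // prints: 10 11 12 13 4 5 6 7
}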
class Vectorized { return {Sleef_atanhf4_u10(_vec0), Sleef_atanhf4_u10(_vec1)}; } Vectorized atan2(const Vectorized& b) const { +<<<<<<< HEAD return {Sleef_atan2f4_u10(_vec0, b._vec0), Sleef_atan2f4_u10(_vec1, b._vec1)}; } Vectorized copysign(const Vectorized &sign) const { return {Sleef_copysignf4(_vec0, sign._vec0), Sleef_copysignf4(_vec1, sign._vec1)}; +======= + return { + Sleef_atan2f4_u10(_vec0, b._vec0), Sleef_atan2f4_u10(_vec1, b._vec1)}; + } + Vectorized copysign(const Vectorized& sign) const { + return { + Sleef_copysignf4(_vec0, sign._vec0), + Sleef_copysignf4(_vec1, sign._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized lgamma() const { return {Sleef_lgammaf4_u10(_vec0), Sleef_lgammaf4_u10(_vec1)}; @@ -305,7 +413,13 @@ class Vectorized { Vectorized angle() const { auto tmp = blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return blendv(tmp, *this, isnan()); } Vectorized real() const { @@ -393,6 +507,7 @@ class Vectorized { } Vectorized C10_ALWAYS_INLINE pow(const Vectorized& exp) const { +<<<<<<< HEAD return {Sleef_powf4_u10(_vec0, exp._vec0), Sleef_powf4_u10(_vec1, exp._vec1)}; } @@ -406,6 +521,24 @@ class Vectorized { Vectorized nextafter(const Vectorized& b) const { return {Sleef_nextafterf4(_vec0, b._vec0), Sleef_nextafterf4(_vec1, b._vec1)}; +======= + return { + Sleef_powf4_u10(_vec0, exp._vec0), Sleef_powf4_u10(_vec1, exp._vec1)}; + } + + Vectorized fmod(const Vectorized& b) const { + return {Sleef_fmodf4(_vec0, b._vec0), Sleef_fmodf4(_vec1, b._vec1)}; + } + + Vectorized hypot(const Vectorized& b) const { + return { + Sleef_hypotf4_u05(_vec0, b._vec0), Sleef_hypotf4_u05(_vec1, b._vec1)}; + } + + Vectorized nextafter(const Vectorized& b) const { + return { + Sleef_nextafterf4(_vec0, b._vec0), Sleef_nextafterf4(_vec1, b._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized igamma(const Vectorized& x) const { @@ -453,16 +586,29 @@ class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.minimum(b); } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -498,5 +644,56 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, const } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const 
Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h index ae146dae4d42..006053d6fed3 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h @@ -1,14 +1,25 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { private: union { @@ -35,7 +46,12 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vint16 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool16 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint16 v1, vint16 v2) : _vec0{v1}, _vec1{v2} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vbool16 v1, vbool16 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vbool16 v1, vbool16 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(int16_t scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} @@ -89,7 +105,12 @@ class Vectorized { template static std::enable_if_t<(mask & 65535) == 65535, Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } @@ -101,7 +122,12 @@ class Vectorized { template static std::enable_if_t<(mask > 0 && mask < 255), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes 
with float/bfloat16/half (#2791)) constexpr int16_t g0 = (mask & 1) * 0xffff; constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; @@ -119,7 +145,12 @@ class Vectorized { static std::enable_if_t< (mask > 255 && (mask & 65535) != 65535 && ((mask & 255) == 255)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int16_t g0_2 = (mask & 1) * 0xffff; constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; @@ -139,7 +170,12 @@ class Vectorized { static std::enable_if_t< (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) == 0)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int16_t mask2 = (mask & 65535) >> 16; constexpr int16_t g0_2 = (mask & 1) * 0xffff; constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; @@ -161,7 +197,12 @@ class Vectorized { (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) != 0) && ((mask & 255) != 255)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr int16_t g0 = (mask & 1) * 0xffff; constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; @@ -202,7 +243,13 @@ class Vectorized { } template +<<<<<<< HEAD static Vectorized arange(int16_t base = 0, step_t step = static_cast(1)) { +======= + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized( base, base + step, @@ -282,7 +329,12 @@ class Vectorized { __at_align__ value_type tmp_values[size()]; vec_vsx_st(_vec0, offset0, tmp_values); vec_vsx_st(_vec1, offset16, tmp_values); +<<<<<<< HEAD std::memcpy(ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); +======= + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } const int16_t& operator[](int idx) const = delete; @@ -290,7 +342,13 @@ class Vectorized { Vectorized angle() const { return blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized real() const { return *this; @@ -335,6 +393,7 @@ class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { vuint16 shift_vec0 = reinterpret_cast(b.vec0()); vuint16 shift_vec1 = reinterpret_cast(b.vec1()); @@ -346,6 +405,25 @@ Vectorized inline operator>>(const Vectorized& 
a, const Vector vuint16 shift_vec0 = reinterpret_cast(b.vec0()); vuint16 shift_vec1 = reinterpret_cast(b.vec1()) ; return Vectorized{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint16 shift_vec0 = reinterpret_cast(b.vec0()); + vuint16 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint16 shift_vec0 = reinterpret_cast(b.vec0()); + vuint16 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -363,6 +441,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -398,5 +477,55 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, co } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h index 98401381c6e8..6ec23580acac 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h @@ -1,14 +1,25 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { 
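// Illustrative sketch only: a portable scalar model of the per-lane operator<< /
// operator>> overloads reformatted nearby. The shift counts are reinterpreted as
// unsigned lanes and each element is shifted by the matching element of b; per
// AltiVec/VSX semantics, vec_sl/vec_sr take the count modulo the element width
// and vec_sr is a logical (unsigned) right shift, which the casts below model.
// Names and sample values are assumptions for the sketch.
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  std::array<int32_t, 4> a = {1, 8, 256, 7};
  std::array<int32_t, 4> b = {3, 1, 4, 0};
  std::array<int32_t, 4> left{}, right{};
  for (std::size_t i = 0; i < a.size(); ++i) {
    uint32_t amt = static_cast<uint32_t>(b[i]) & 31u;  // count modulo element width
    left[i]  = static_cast<int32_t>(static_cast<uint32_t>(a[i]) << amt);  // like vec_sl
    right[i] = static_cast<int32_t>(static_cast<uint32_t>(a[i]) >> amt);  // like vec_sr (logical)
  }
  for (std::size_t i = 0; i < a.size(); ++i)
    std::cout << left[i] << '/' << right[i] << ' ';
  std::cout << '\n';  // prints: 8/0 16/4 4096/16 7/7
}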
private: union { @@ -35,7 +46,12 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(int32_t scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( @@ -63,8 +79,14 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t<(mask & 255) == 255, Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t<(mask & 255) == 255, Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } @@ -76,7 +98,12 @@ class Vectorized { template static std::enable_if_t<(mask > 0 && mask < 15), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint32_t g0 = (mask & 1) * 0xffffffff; constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; @@ -90,7 +117,12 @@ class Vectorized { static std::enable_if_t< (mask > 15 && (mask & 255) != 255 && ((mask & 15) == 15)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint32_t mask2 = (mask & 255) >> 4; constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; @@ -106,7 +138,12 @@ class Vectorized { static std::enable_if_t< (mask > 15 && ((mask & 255) != 255) && ((mask & 15) == 0)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint32_t mask2 = (mask & 255) >> 4; constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; @@ -123,7 +160,12 @@ class Vectorized { (mask > 15 && ((mask & 255) != 255) && ((mask & 15) != 0) && ((mask & 15) != 15)), Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint32_t g0 = (mask & 1) * 0xffffffff; constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; @@ -155,7 +197,13 @@ class Vectorized { } template 
+<<<<<<< HEAD static Vectorized arange(int32_t base = 0.f, step_t step = static_cast(1)) { +======= + static Vectorized arange( + int32_t base = 0.f, + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized( base, base + step, @@ -221,7 +269,13 @@ class Vectorized { Vectorized angle() const { return blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized real() const { return *this; @@ -266,6 +320,7 @@ class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { vuint32 shift_vec0 = reinterpret_cast(b.vec0()); vuint32 shift_vec1 = reinterpret_cast(b.vec1()) ; @@ -277,6 +332,25 @@ Vectorized inline operator>>(const Vectorized& a, const Vector vuint32 shift_vec0 = reinterpret_cast(b.vec0()); vuint32 shift_vec1 = reinterpret_cast(b.vec1()) ; return Vectorized{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint32 shift_vec0 = reinterpret_cast(b.vec0()); + vuint32 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint32 shift_vec0 = reinterpret_cast(b.vec0()); + vuint32 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -294,6 +368,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -329,5 +404,55 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, co } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} 
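// Illustrative sketch only: a minimal model of the layout all of these VSX
// specializations share — a 256-bit Vectorized<T> is stored as two 128-bit halves
// (_vec0/_vec1), and each element-wise free operator simply applies the matching
// 128-bit operation to both halves, as in the operator+/-/*/&/|/^ overloads above.
// The struct and member names below are assumptions for the sketch, not the
// header's API.
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

struct VecI32x8 {
  std::array<int32_t, 4> lo;  // plays the role of _vec0
  std::array<int32_t, 4> hi;  // plays the role of _vec1
};

static VecI32x8 operator+(const VecI32x8& a, const VecI32x8& b) {
  VecI32x8 r{};
  for (std::size_t i = 0; i < 4; ++i) {
    r.lo[i] = a.lo[i] + b.lo[i];  // models vec_add on the first half
    r.hi[i] = a.hi[i] + b.hi[i];  // models vec_add on the second half
  }
  return r;
}

int main() {
  VecI32x8 a{{0, 1, 2, 3}, {4, 5, 6, 7}};
  VecI32x8 b{{10, 10, 10, 10}, {20, 20, 20, 20}};
  VecI32x8 c = a + b;
  for (auto v : c.lo) std::cout << v << ' ';
  for (auto v : c.hi) std::cout << v << ' ';
  std::cout << '\n';  // prints: 10 11 12 13 24 25 26 27
}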
+ +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h index f8217930fa49..fa164b13672c 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h @@ -1,14 +1,25 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) class Vectorized { private: union { @@ -36,7 +47,12 @@ class Vectorized { C10_ALWAYS_INLINE Vectorized(vint64 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint64 v1, vint64 v2) : _vec0{v1}, _vec1{v2} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(int64_t scalar) : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} C10_ALWAYS_INLINE Vectorized( @@ -66,14 +82,26 @@ class Vectorized { } template +<<<<<<< HEAD static std::enable_if_t<(mask & 15) == 15, Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t<(mask & 15) == 15, Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return b; } template +<<<<<<< HEAD static std::enable_if_t<(mask > 0 && mask < 3), Vectorized> C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + static std::enable_if_t<(mask > 0 && mask < 3), Vectorized> + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff; constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; const vbool64 mask_1st = (vbool64){g0, g1}; @@ -82,7 +110,12 @@ class Vectorized { template static std::enable_if_t<(mask > 3) && (mask & 3) == 0, Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff; constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff; @@ -94,7 +127,12 @@ class Vectorized { static std::enable_if_t< (mask > 3) && (mask & 3) != 0 && (mask & 15) != 15, Vectorized> +<<<<<<< HEAD C10_ALWAYS_INLINE blend(const Vectorized& a, 
const Vectorized& b) { +======= + C10_ALWAYS_INLINE + blend(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff; constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff; @@ -118,8 +156,16 @@ class Vectorized { vec_sel(a._vec1, b._vec1, mask._vecb1)}; } template +<<<<<<< HEAD static Vectorized arange(int64_t base = 0., step_t step = static_cast(1)) { return Vectorized(base, base + step, base + 2 * step, base + 3 * step); +======= + static Vectorized arange( + int64_t base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, base + step, base + 2 * step, base + 3 * step); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static Vectorized C10_ALWAYS_INLINE @@ -174,7 +220,13 @@ class Vectorized { Vectorized angle() const { return blendv( +<<<<<<< HEAD Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); +======= + Vectorized(0), + Vectorized(c10::pi), + *this < Vectorized(0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized real() const { return *this; @@ -219,6 +271,7 @@ class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { vuint64 shift_vec0 = reinterpret_cast(b.vec0()); vuint64 shift_vec1 = reinterpret_cast(b.vec1()) ; @@ -230,6 +283,25 @@ Vectorized inline operator>>(const Vectorized& a, const Vector vuint64 shift_vec0 = reinterpret_cast(b.vec0()); vuint64 shift_vec1 = reinterpret_cast(b.vec1()) ; return Vectorized{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + vuint64 shift_vec0 = reinterpret_cast(b.vec0()); + vuint64 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)}; +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + vuint64 shift_vec0 = reinterpret_cast(b.vec0()); + vuint64 shift_vec1 = reinterpret_cast(b.vec1()); + return Vectorized{ + vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -247,6 +319,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -282,5 +355,56 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& a, co } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const 
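// Hedged sketch, illustrative only: how the compile-time blend<mask>
// overloads above turn a 4-bit mask into per-lane selectors. Each low bit of
// the mask is multiplied into an all-ones 64-bit lane mask (g0, g1, g0_2,
// g1_2 in the diff), which vec_sel then uses to pick lane i from b when the
// bit is set and from a otherwise. lane_selector is a stand-in name.
#include <cstdint>

constexpr uint64_t lane_selector(uint32_t mask, int bit) {
  return ((mask >> bit) & 1u) ? 0xffffffffffffffffULL : 0x0ULL;
}

static_assert(lane_selector(0b0001, 0) == 0xffffffffffffffffULL, "lane 0 from b");
static_assert(lane_selector(0b0001, 1) == 0x0ULL, "lane 1 from a");
// mask == 15 selects every lane from b, which is why that case returns b directly.
static_assert(lane_selector(0b1111, 3) == 0xffffffffffffffffULL, "lane 3 from b");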
Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_div(a.vec0(), b.vec0()), vec_div(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h index 8068d6102f4a..c74483abdef6 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -24,14 +29,24 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized { private: union { @@ -68,7 +83,12 @@ struct Vectorized { C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {} C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} +======= + C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) + : _vecb0{v1}, _vecb1{v2} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized(const c10::qint32& val) : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {} @@ -114,11 +134,23 @@ struct Vectorized { vfloat32 float_vals1 = vec_float(_vec1); vfloat32 scale_vec0 = scale.vec0(); vfloat32 scale_vec1 = scale.vec1(); +<<<<<<< HEAD vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); return {Vectorized{ vec_madd(scale_vec0, float_vals0, scale_zp_premul0), vec_madd(scale_vec1, float_vals1, scale_zp_premul1)}}; +======= + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_sub_zero_point_0 = vec_sub(float_vals0, zero_point_vec0); + vfloat32 vec_sub_zero_point_1 = vec_sub(float_vals1, zero_point_vec1); + Vectorized vf0 = 
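// Hedged sketch, not from the patch itself: the qint32 dequantize path
// changes from a fused multiply-add against a precomputed
// scale_zp_premul = -scale * zero_point term to an explicit
// subtract-then-multiply. Both compute the same affine dequantization;
// only the rounding of intermediates can differ, since fma rounds once.
#include <cassert>
#include <cmath>

float dequant_fma(float q, float scale, float scale_zp_premul) {
  return std::fma(scale, q, scale_zp_premul);  // old form
}

float dequant_sub_mul(float q, float scale, float zero_point) {
  return (q - zero_point) * scale;  // new form in the diff
}

int main() {
  const float scale = 0.05f, zero_point = 10.0f, q = 37.0f;
  const float a = dequant_fma(q, scale, -scale * zero_point);
  const float b = dequant_sub_mul(q, scale, zero_point);
  // Tolerance rather than bit-equality: fma rounds once, sub/mul twice.
  assert(std::fabs(a - b) < 1e-5f);
  return 0;
}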
{ + vec_mul(scale_vec0, vec_sub_zero_point_0), + vec_mul(scale_vec1, vec_sub_zero_point_1)}; + return {vf0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } float_vec_return_type dequantize( @@ -154,8 +186,13 @@ struct Vectorized { vecf1 = vec_mul(vecf1, inverse_scale_v); vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); +<<<<<<< HEAD vint32 veci0 = vec_signed(vecf0); vint32 veci1 = vec_signed(vecf1); +======= + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) veci0 = vec_max(veci0, vmin); veci1 = vec_max(veci1, vmin); @@ -199,8 +236,13 @@ struct Vectorized { vecf0 = vec_rint(vecf0); vecf1 = vec_rint(vecf1); +<<<<<<< HEAD vint32 veci0 = vec_add(vec_signed(vecf0),vec_zero_point); vint32 veci1 = vec_add(vec_signed(vecf1),vec_zero_point); +======= + vint32 veci0 = vec_add(vec_signed(vecf0), vec_zero_point); + vint32 veci1 = vec_add(vec_signed(vecf1), vec_zero_point); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) veci0 = vec_max(veci0, vmin); veci1 = vec_max(veci1, vmin); @@ -242,6 +284,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -277,5 +320,55 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h index f67d42a4cb51..03087e55802d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
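// Hedged sketch of the quantize direction shown above, written per lane.
// The VSX sequence multiplies by the inverse scale, rounds to nearest
// (vec_rint), adds the zero point, and clamps to the representable range
// (vec_max with vmin, vec_min with vmax) before packing. quantize_ref is an
// illustrative name; the diff has two variants that add the zero point either
// as float or as int, which agree for in-range values.
#include <algorithm>
#include <cmath>
#include <cstdint>

int32_t quantize_ref(float x, float inverse_scale, int32_t zero_point,
                     int32_t qmin, int32_t qmax) {
  const float scaled = x * inverse_scale;
  const int32_t rounded =
      static_cast<int32_t>(std::nearbyint(scaled)) + zero_point;
  return std::min(std::max(rounded, qmin), qmax);
}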
for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -24,14 +29,24 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { inline namespace CPU_CAPABILITY { template <> +<<<<<<< HEAD +======= +struct is_vec_specialized_for : std::bool_constant {}; +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized { private: union { @@ -144,6 +159,7 @@ struct Vectorized { vfloat32 vecf1_3 = vec_float(veci7); vfloat32 scale_vec0 = scale.vec0(); vfloat32 scale_vec1 = scale.vec1(); +<<<<<<< HEAD vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); return { @@ -164,6 +180,41 @@ struct Vectorized { float_vec_return_type C10_ALWAYS_INLINE dequantize( Vectorized scale, Vectorized zero_point) const { +======= + + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_substract_src_zp0_0 = vec_sub(vecf0_0, zero_point_vec0); + vfloat32 vec_substract_src_zp1_0 = vec_sub(vecf1_0, zero_point_vec1); + Vectorized vf0_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_0), + vec_mul(scale_vec1, vec_substract_src_zp1_0)}; + + vfloat32 vec_substract_src_zp0_1 = vec_sub(vecf0_1, zero_point_vec0); + vfloat32 vec_substract_src_zp1_1 = vec_sub(vecf1_1, zero_point_vec1); + Vectorized vf1_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_1), + vec_mul(scale_vec1, vec_substract_src_zp1_1)}; + + vfloat32 vec_substract_src_zp0_2 = vec_sub(vecf0_2, zero_point_vec0); + vfloat32 vec_substract_src_zp1_2 = vec_sub(vecf1_2, zero_point_vec1); + Vectorized vf2_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_2), + vec_mul(scale_vec1, vec_substract_src_zp1_2)}; + + vfloat32 vec_substract_src_zp0_3 = vec_sub(vecf0_3, zero_point_vec0); + vfloat32 vec_substract_src_zp1_3 = vec_sub(vecf1_3, zero_point_vec1); + Vectorized vf3_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_3), + vec_mul(scale_vec1, vec_substract_src_zp1_3)}; + + return {vf0_zp, vf1_zp, vf2_zp, vf3_zp}; + } + + float_vec_return_type C10_ALWAYS_INLINE + dequantize(Vectorized scale, Vectorized zero_point) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vint16 vecshi0 = vec_unpackh(_vec0); vint16 vecshi1 = vec_unpackl(_vec0); @@ -290,7 +341,12 @@ struct Vectorized { return {vec0, vec1}; } +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE relu(Vectorized zero_point) const { +======= + Vectorized C10_ALWAYS_INLINE + relu(Vectorized zero_point) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; } @@ -444,6 +500,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -479,5 +536,55 @@ Vectorized C10_ALWAYS_INLINE 
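// Hedged sketch, simplified: for the 8-bit quantized types the diff widens a
// 256-bit vector of 32 int8 lanes into four Vectorized<float> groups (via
// unpack high/low to int16, then int32, then vec_float) and applies
// (v - zero_point) * scale to each group. The straight group*8+lane ordering
// below is a simplification; the real unpack sequence interleaves lanes, but
// the per-lane arithmetic is the same.
#include <array>
#include <cstdint>

std::array<std::array<float, 8>, 4> dequantize_i8_ref(
    const std::array<int8_t, 32>& q, float scale, float zero_point) {
  std::array<std::array<float, 8>, 4> out{};
  for (int group = 0; group < 4; ++group) {
    for (int lane = 0; lane < 8; ++lane) {
      const float widened = static_cast<float>(q[group * 8 + lane]);
      out[group][lane] = (widened - zero_point) * scale;
    }
  }
  return out;
}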
operator^(const Vectorized& } } // namespace +======= +Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h index c0d77d500491..aa1089551bf5 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h @@ -1,8 +1,13 @@ #pragma once #include +<<<<<<< HEAD #include #include +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -26,13 +31,24 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { inline namespace CPU_CAPABILITY { +<<<<<<< HEAD +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vint16 mask_unsigned = vec_splats((short int)0xFF); template <> struct Vectorized { @@ -155,6 +171,7 @@ struct Vectorized { vfloat32 vecf1_3 = vec_float(veci7); vfloat32 scale_vec0 = scale.vec0(); vfloat32 scale_vec1 = scale.vec1(); +<<<<<<< HEAD vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); return { @@ -175,6 +192,41 @@ struct Vectorized { float_vec_return_type C10_ALWAYS_INLINE dequantize( Vectorized scale, Vectorized zero_point) const { +======= + + vfloat32 zero_point_vec0 = zero_point.vec0(); + vfloat32 zero_point_vec1 = zero_point.vec1(); + + vfloat32 vec_substract_src_zp0_0 = vec_sub(vecf0_0, zero_point_vec0); + vfloat32 vec_substract_src_zp1_0 = vec_sub(vecf1_0, zero_point_vec1); + Vectorized vf0_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_0), + vec_mul(scale_vec1, vec_substract_src_zp1_0)}; + + vfloat32 vec_substract_src_zp0_1 = vec_sub(vecf0_1, zero_point_vec0); + vfloat32 vec_substract_src_zp1_1 = vec_sub(vecf1_1, zero_point_vec1); + Vectorized vf1_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_1), + vec_mul(scale_vec1, vec_substract_src_zp1_1)}; + + vfloat32 vec_substract_src_zp0_2 = vec_sub(vecf0_2, zero_point_vec0); + vfloat32 vec_substract_src_zp1_2 = vec_sub(vecf1_2, zero_point_vec1); + Vectorized vf2_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_2), + vec_mul(scale_vec1, vec_substract_src_zp1_2)}; + + vfloat32 vec_substract_src_zp0_3 = vec_sub(vecf0_3, zero_point_vec0); + vfloat32 vec_substract_src_zp1_3 = vec_sub(vecf1_3, zero_point_vec1); + Vectorized vf3_zp = { + vec_mul(scale_vec0, vec_substract_src_zp0_3), + vec_mul(scale_vec1, vec_substract_src_zp1_3)}; + + return {vf0_zp, vf1_zp, vf2_zp, vf3_zp}; + } + + float_vec_return_type C10_ALWAYS_INLINE + dequantize(Vectorized scale, Vectorized zero_point) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // unpacking unsigned as signed vint16 vecshi0 = vec_unpackh((vint8)_vec0); vint16 vecshi1 = vec_unpackl((vint8)_vec0); @@ -214,6 +266,10 @@ struct Vectorized { vfloat32 vecf1_3 = vec_float(veci7); vfloat32 scale_vec0 = scale.vec0(); vfloat32 scale_vec1 = scale.vec1(); +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vfloat32 zero_point0 = zero_point.vec0(); vfloat32 zero_point1 = zero_point.vec1(); return { @@ -298,12 +354,23 @@ struct Vectorized { return {vec0, vec1}; } +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE relu(Vectorized zero_point) const { return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; } Vectorized C10_ALWAYS_INLINE relu6(Vectorized zero_point, Vectorized q_six) const { +======= + Vectorized C10_ALWAYS_INLINE + relu(Vectorized zero_point) const { + return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; + } + + Vectorized C10_ALWAYS_INLINE relu6( + Vectorized zero_point, + Vectorized q_six) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
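// Hedged sketch of the trait pattern the incoming side adds next to each
// Vectorized specialization: is_vec_specialized_for lets generic code detect
// at compile time whether an element type has a hand-written SIMD
// implementation. The primary template and quint8_tag below are simplified
// stand-ins, not the actual c10 definitions.
#include <type_traits>

template <typename T, typename Enable = void>
struct is_vec_specialized_for : std::bool_constant<false> {};

struct quint8_tag {};  // stand-in for c10::quint8 in this sketch

template <>
struct is_vec_specialized_for<quint8_tag> : std::bool_constant<true> {};

static_assert(is_vec_specialized_for<quint8_tag>::value,
              "has a specialized Vectorized");
static_assert(!is_vec_specialized_for<long double>::value,
              "falls back to the generic path");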
float/bfloat16/half (#2791)) vuint8 max0 = vec_max(_vec0, zero_point._vec0); vuint8 max1 = vec_max(_vec1, zero_point._vec1); return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)}; @@ -462,6 +529,7 @@ Vectorized inline minimum( } template <> +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { return Vectorized{vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; } @@ -497,5 +565,55 @@ Vectorized C10_ALWAYS_INLINE operator^(const Vectorized C10_ALWAYS_INLINE +operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_add(a.vec0(), b.vec0()), vec_add(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_sub(a.vec0(), b.vec0()), vec_sub(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_mul(a.vec0(), b.vec0()), vec_mul(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_and(a.vec0(), b.vec0()), vec_and(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_or(a.vec0(), b.vec0()), vec_or(a.vec1(), b.vec1())}; +} + +template <> +Vectorized C10_ALWAYS_INLINE +operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized{ + vec_xor(a.vec0(), b.vec0()), vec_xor(a.vec1(), b.vec1())}; +} + +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h index 1dc742f3cbb1..d8139e086c43 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h @@ -1,13 +1,20 @@ #pragma once +<<<<<<< HEAD #include #include #include +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(__clang__) typedef __vector __bool char vbool8; typedef __vector __bool short vbool16; typedef __vector __bool int vbool32; typedef __vector __bool long long vbool64; +<<<<<<< HEAD using vint8 = __attribute__((vector_size(16))) signed char; using vint16 = __attribute__((vector_size(16))) signed short; using vint32 = __attribute__((vector_size(16))) signed int; @@ -33,6 +40,37 @@ using vuint32 = __attribute__((altivec(vector__))) unsigned int; using vuint64 = __attribute__((altivec(vector__))) unsigned long long; using vfloat32 = __attribute__((altivec(vector__))) float; using vfloat64 = __attribute__((altivec(vector__))) double; +======= +using vint8 = __attribute__((vector_size(16))) signed char; +using vint16 = __attribute__((vector_size(16))) signed short; +using vint32 = __attribute__((vector_size(16))) signed int; +using vint64 = __attribute__((vector_size(16))) signed long long; +using vuint8 = __attribute__((vector_size(16))) unsigned char; +using vuint16 = __attribute__((vector_size(16))) unsigned short; +using vuint32 = __attribute__((vector_size(16))) unsigned int; +using 
vuint64 = __attribute__((vector_size(16))) unsigned long long; +using vfloat32 = __attribute__((vector_size(16))) float; +using vfloat64 = __attribute__((vector_size(16))) double; +#else +using vbool8 = + __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) char; +using vbool16 = + __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) short; +using vbool32 = + __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) int; +using vbool64 = __attribute__((altivec(vector__))) +__attribute__((altivec(bool__))) long long; +using vint8 = __attribute__((altivec(vector__))) signed char; +using vint16 = __attribute__((altivec(vector__))) signed short; +using vint32 = __attribute__((altivec(vector__))) signed int; +using vint64 = __attribute__((altivec(vector__))) signed long long; +using vuint8 = __attribute__((altivec(vector__))) unsigned char; +using vuint16 = __attribute__((altivec(vector__))) unsigned short; +using vuint32 = __attribute__((altivec(vector__))) unsigned int; +using vuint64 = __attribute__((altivec(vector__))) unsigned long long; +using vfloat32 = __attribute__((altivec(vector__))) float; +using vfloat64 = __attribute__((altivec(vector__))) double; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif #if !defined(vec_float) @@ -71,7 +109,11 @@ C10_ALWAYS_INLINE vfloat64 vec_neg(const vfloat64& vec_in) { } C10_ALWAYS_INLINE vint16 vec_neg(const vint16& vec_in) { +<<<<<<< HEAD vint16 vint0 = {0, 0, 0, 0 ,0, 0, 0, 0}; +======= + vint16 vint0 = {0, 0, 0, 0, 0, 0, 0, 0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_vsubuhm(vint0, vec_in); } @@ -116,6 +158,7 @@ C10_ALWAYS_INLINE T vec_max_nan(const T& a, const T& b) { } // Specializations for float/double taken from Eigen +<<<<<<< HEAD template<> C10_ALWAYS_INLINE vfloat32 vec_min_nan(const vfloat32& a, const vfloat32& b) { @@ -148,6 +191,52 @@ C10_ALWAYS_INLINE vfloat64 vec_max_nan(const vfloat64& a, const vfloat // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN vfloat64 ret; __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); +======= +template <> +C10_ALWAYS_INLINE vfloat32 +vec_min_nan(const vfloat32& a, const vfloat32& b) { + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE + // regarding NaN + vfloat32 ret; + __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} +// Specializations for float/double taken from Eigen +template <> +C10_ALWAYS_INLINE vfloat32 +vec_max_nan(const vfloat32& a, const vfloat32& b) { + // NOTE: about 10% slower than vec_max, but consistent with std::min and SSE + // regarding NaN + vfloat32 ret; + __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} + +template <> +C10_ALWAYS_INLINE vfloat64 +vec_min_nan(const vfloat64& a, const vfloat64& b) { + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE + // regarding NaN + vfloat64 ret; + __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); + return ret; +} +template <> +C10_ALWAYS_INLINE vfloat64 +vec_max_nan(const vfloat64& a, const vfloat64& b) { + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE + // regarding NaN + vfloat64 
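// Hedged sketch: scalar equivalents of the Eigen-derived vec_min_nan /
// vec_max_nan specializations above. The inline asm builds a compare mask
// (xvcmpgesp / xvcmpgtsp) and then selects with xxsel, i.e. "compare, then
// pick", which is what keeps the behaviour consistent with std::min/std::max
// and SSE when NaNs are involved. min_nan_ref/max_nan_ref are illustrative names.
float min_nan_ref(float a, float b) {
  // mask = (a >= b); result = mask ? b : a.
  // If either operand is NaN the compare is false and 'a' is returned,
  // matching std::min's NaN behaviour noted in the comment above.
  return (a >= b) ? b : a;
}

float max_nan_ref(float a, float b) {
  // mask = (b > a); result = mask ? b : a.
  return (b > a) ? b : a;
}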
ret; + __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" + : "=&wa"(ret) + : "wa"(a), "wa"(b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return ret; } @@ -168,18 +257,29 @@ C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat64, vbool64, vec_max) #undef C10_VSX_VEC_NAN_PROPAG +<<<<<<< HEAD #define DEFINE_MEMBER_UNARY_OP(op, op_type, func) \ +======= +#define DEFINE_MEMBER_UNARY_OP(op, op_type, func) \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized C10_ALWAYS_INLINE op() const { \ return Vectorized{func(_vec0), func(_vec1)}; \ } #define DEFINE_MEMBER_OP(op, op_type, func) \ +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) const { \ return Vectorized{ \ +======= + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + return Vectorized{ \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) func(_vec0, other._vec0), func(_vec1, other._vec1)}; \ } #define DEFINE_MEMBER_BITWISE_OP(op, op_type, func) \ +<<<<<<< HEAD Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) const { \ return Vectorized{ \ func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)}; \ @@ -256,12 +356,101 @@ C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat64, vbool64, vec_max) // it can be used to emulate blend faster constexpr int blendChoice(uint32_t mask, uint32_t half1 = 0xF, uint32_t half2 = 0xF0) { +======= + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + return Vectorized{ \ + func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)}; \ + } + +#define DEFINE_MEMBER_TERNARY_OP(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op( \ + const Vectorized& b, const Vectorized& c) const { \ + return Vectorized{ \ + func(_vec0, b._vec0, c._vec0), func(_vec1, b._vec1, c._vec1)}; \ + } + +#define DEFINE_MEMBER_EMULATE_BINARY_OP(op, op_type, binary_op) \ + Vectorized C10_ALWAYS_INLINE op(const Vectorized& b) \ + const { \ + Vectorized::vec_internal_type ret_0; \ + Vectorized::vec_internal_type ret_1; \ + for (int i = 0; i < Vectorized::size() / 2; i++) { \ + ret_0[i] = _vec0[i] binary_op b._vec0[i]; \ + ret_1[i] = _vec1[i] binary_op b._vec1[i]; \ + } \ + return Vectorized{ret_0, ret_1}; \ + } + +#define DEFINE_MEMBER_OP_AND_ONE(op, op_type, func) \ + Vectorized C10_ALWAYS_INLINE op(const Vectorized& other) \ + const { \ + using vvtype = Vectorized::vec_internal_type; \ + const vvtype v_one = vec_splats(static_cast(1.0)); \ + vvtype ret0 = (vvtype)func(_vec0, other._vec0); \ + vvtype ret1 = (vvtype)func(_vec1, other._vec1); \ + return Vectorized{vec_and(ret0, v_one), vec_and(ret1, v_one)}; \ + } + +#define DEFINE_CLAMP_FUNCS(operand_type) \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp( \ + const Vectorized& a, \ + const Vectorized& min, \ + const Vectorized& max) { \ + return Vectorized{ \ + vec_min_nan(vec_max_nan(a.vec0(), min.vec0()), max.vec0()), \ + vec_min_nan(vec_max_nan(a.vec1(), min.vec1()), max.vec1())}; \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp_min( \ + const Vectorized& a, \ + const Vectorized& min) { \ + return Vectorized{ \ + vec_max_nan(a.vec0(), min.vec0()), vec_max_nan(a.vec1(), min.vec1())}; \ + } \ + template <> \ + Vectorized C10_ALWAYS_INLINE clamp_max( \ + const Vectorized& a, \ + const Vectorized& max) { \ + return Vectorized{ \ + vec_min_nan(a.vec0(), max.vec0()), 
vec_min_nan(a.vec1(), max.vec1())}; \ + } + +#define DEFINE_REINTERPRET_CAST_FUNCS( \ + first_type, cast_type, cast_inner_vector_type) \ + template <> \ + C10_ALWAYS_INLINE Vectorized cast( \ + const Vectorized& src) { \ + return Vectorized{ \ + (cast_inner_vector_type)src.vec0(), \ + (cast_inner_vector_type)src.vec1()}; \ + } + +#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(first_type) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, double, vfloat64) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, float, vfloat32) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int64_t, vint64) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int32_t, vint32) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int16_t, vint16) + +// it can be used to emulate blend faster +constexpr int blendChoice( + uint32_t mask, + uint32_t half1 = 0xF, + uint32_t half2 = 0xF0) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) uint32_t none = 0; uint32_t both = half1 | half2; // clamp it between 0 and both mask = mask & both; // return (a._vec0, a._vec1) +<<<<<<< HEAD if (mask == none) return 0; +======= + if (mask == none) + return 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return (b._vec0,b._vec1) else if (mask == both) return 1; @@ -320,18 +509,36 @@ constexpr vbool64 VsxDblMask2(uint32_t mask) { constexpr int maskForComplex(uint32_t mask) { mask = mask & 0xF; int complex_mask = 0; +<<<<<<< HEAD if (mask & 1) complex_mask |= 3; if (mask & 2) complex_mask |= (3 << 2); if (mask & 4) complex_mask |= (3 << 4); if (mask & 8) complex_mask |= (3 << 6); +======= + if (mask & 1) + complex_mask |= 3; + if (mask & 2) + complex_mask |= (3 << 2); + if (mask & 4) + complex_mask |= (3 << 4); + if (mask & 8) + complex_mask |= (3 << 6); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return complex_mask; } constexpr int maskForComplexDbl(uint32_t mask) { mask = mask & 0x3; int complex_mask = 0; +<<<<<<< HEAD if (mask & 1) complex_mask |= 3; if (mask & 2) complex_mask |= (3 << 2); +======= + if (mask & 1) + complex_mask |= 3; + if (mask & 2) + complex_mask |= (3 << 2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return complex_mask; } @@ -352,7 +559,13 @@ constexpr vbool32 VsxComplexMask2(uint32_t mask) { return VsxMask1(maskForComplex(mask2)); } +<<<<<<< HEAD constexpr vbool64 VsxComplexDblMask1(uint32_t mask) { return VsxDblMask1(mask); } +======= +constexpr vbool64 VsxComplexDblMask1(uint32_t mask) { + return VsxDblMask1(mask); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr vbool64 VsxComplexDblMask2(uint32_t mask) { uint32_t mask2 = (mask & 0xF) >> 2; @@ -369,8 +582,28 @@ constexpr int offset0 = 0; constexpr int offset16 = 16; // #Constants +<<<<<<< HEAD const vuint8 mask_zero_bits = vuint8{128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 96, 64, 32, 0}; +======= +const vuint8 mask_zero_bits = vuint8{ + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 128, + 96, + 64, + 32, + 0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vuint8 swap_mask = vuint8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; @@ 
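// Hedged per-lane reading of DEFINE_CLAMP_FUNCS above: clamp raises the value
// to the lower bound with the NaN-aware max, then caps it with the NaN-aware
// min; clamp_min and clamp_max are the two halves. The scalar forms below
// mirror the compare-and-select order of vec_max_nan / vec_min_nan.
float clamp_ref(float a, float lo, float hi) {
  const float raised = (lo > a) ? lo : a;  // vec_max_nan(a, lo)
  return (raised >= hi) ? hi : raised;     // vec_min_nan(raised, hi)
}
float clamp_min_ref(float a, float lo) { return (lo > a) ? lo : a; }
float clamp_max_ref(float a, float hi) { return (a >= hi) ? hi : a; }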
-391,9 +624,15 @@ const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF}; const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000}; const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0}; +<<<<<<< HEAD const vbool64 vd_sign_mask = vbool64{0x8000000000000000, 0x8000000000000000}; const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; +======= +const vbool64 vd_sign_mask = vbool64{0x8000000000000000, 0x8000000000000000}; +const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; +const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000}; const vbool64 vd_rsign_mask = vbool64{0x8000000000000000, 0x0}; @@ -403,7 +642,12 @@ const vfloat32 one = vec_splats(1.f); const vfloat32 two = vec_splats(2.0f); const vfloat32 _4div_pi = vec_splats(1.27323954473516f); const vfloat32 v_inf = (vfloat32)vec_splats(0x7f800000u); +<<<<<<< HEAD const vfloat32 v_minus_inf = vfloat32{ 0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u }; +======= +const vfloat32 v_minus_inf = + vfloat32{0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vfloat32 v_nan = (vfloat32)vec_splats(0x7fffffff); const vfloat32 log10e_inv = vec_splats(0.43429448190325176f); const vfloat32 log2e_inv = vec_splats(1.4426950408889634f); @@ -432,7 +676,12 @@ const vfloat32 log_p8 = vec_splats(+3.3333331174E-1f); const vfloat32 log_q1 = vec_splats(-2.12194440e-4f); const vfloat32 log_q2 = vec_splats(0.693359375f); const vfloat32 max_logf = vec_splats(88.02969187150841f); +<<<<<<< HEAD const vfloat32 max_numf = vec_splats(1.7014117331926442990585209174225846272e38f); +======= +const vfloat32 max_numf = + vec_splats(1.7014117331926442990585209174225846272e38f); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vfloat32 min_inf = (vfloat32)vec_splats(0xff800000u); const vfloat32 min_norm_pos = (vfloat32)vec_splats(0x0800000u); const vfloat32 minus_cephes_dp1 = vec_splats(-0.78515625f); @@ -456,8 +705,16 @@ const vfloat32 tanh_p4 = vec_splats(-3.33332819422E-1f); const vfloat32 vcheck = vec_splats((float)(1LL << 24)); const vfloat32 imag_one = vfloat32{0.f, 1.f, 0.f, 1.f}; const vfloat32 imag_half = vfloat32{0.f, 0.5f, 0.f, 0.5f}; +<<<<<<< HEAD const vfloat32 sqrt2_2 = vfloat32{0.70710676908493042f, 0.70710676908493042, 0.70710676908493042, 0.70710676908493042}; +======= +const vfloat32 sqrt2_2 = vfloat32{ + 0.70710676908493042f, + 0.70710676908493042, + 0.70710676908493042, + 0.70710676908493042}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const vfloat32 pi_2 = vfloat32{M_PI / 2, 0.0, M_PI / 2, 0.0}; const vfloat32 vf_89 = vfloat32{89.f, 89.f, 89.f, 89.f}; const vfloat64 vd_one = vec_splats(1.0); @@ -469,6 +726,10 @@ const vfloat64 vd_imag_half = vfloat64{0.0, 0.5}; const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757}; const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0}; +<<<<<<< HEAD } // namespace +======= +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
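// Hedged sketch of what maskForComplex above does: each complex element
// occupies two adjacent float lanes (real, imag), so bit i of the 4-bit
// element mask becomes bits 2i and 2i+1 of the lane mask. Written as a loop
// here for clarity; complex_lane_mask is an illustrative name.
#include <cstdint>

constexpr int complex_lane_mask(uint32_t mask) {
  mask &= 0xF;
  int out = 0;
  for (int i = 0; i < 4; ++i) {
    if (mask & (1u << i)) {
      out |= 3 << (2 * i);  // real lane + imag lane of element i
    }
  }
  return out;
}

static_assert(complex_lane_mask(0b0001) == 0b00000011, "element 0 -> lanes 0,1");
static_assert(complex_lane_mask(0b1010) == 0b11001100, "elements 1,3 -> lanes 2,3,6,7");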
dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h index 7c2932b3aab7..87fa0477bd7a 100644 --- a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h +++ b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h @@ -31,8 +31,12 @@ constexpr bool is_zarch_implemented() { template constexpr bool is_zarch_implemented_quant() { return ( +<<<<<<< HEAD std::is_same_v || std::is_same_v || +======= + std::is_same_v || std::is_same_v || +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::is_same_v); } @@ -364,6 +368,13 @@ constexpr auto GetSwapMaskFloat() { } template +<<<<<<< HEAD +======= +struct is_vec_specialized_for()>> + : std::bool_constant {}; + +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized()>> { public: using value_type = T; @@ -386,7 +397,12 @@ struct Vectorized()>> { Vectorized() {} C10_ALWAYS_INLINE Vectorized(vtype v) : _vec0{v}, _vec1{v} {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(const vinner_data &v) : _vec0{v.first}, _vec1{v.second} {} +======= + C10_ALWAYS_INLINE Vectorized(const vinner_data& v) + : _vec0{v.first}, _vec1{v.second} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) C10_ALWAYS_INLINE Vectorized(vtype v1, vtype v2) : _vec0{v1}, _vec1{v2} {} C10_ALWAYS_INLINE Vectorized(T s) : _vec0{vec_splats((ElementType)s)}, _vec1{vec_splats((ElementType)s)} {} @@ -396,7 +412,12 @@ struct Vectorized()>> { static Vectorized C10_ALWAYS_INLINE loadu(const U* ptr, int count = size()) { __at_align__ ElementType tmp_values[size()] = {}; +<<<<<<< HEAD std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); +======= + std::memcpy( + tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_xl(offset0, &(tmp_values[0])), @@ -409,6 +430,7 @@ struct Vectorized()>> { static Vectorized C10_ALWAYS_INLINE loadu(const ElementType* ptr, int count = size()) { if (count == size()) { +<<<<<<< HEAD return { vec_xl(offset0, ptr), vec_xl(offset16, ptr)}; @@ -416,6 +438,14 @@ struct Vectorized()>> { __at_align__ ElementType tmp_values[size()] = {}; std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); +======= + return {vec_xl(offset0, ptr), vec_xl(offset16, ptr)}; + } + + __at_align__ ElementType tmp_values[size()] = {}; + std::memcpy( + tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return { vec_xl(offset0, &(tmp_values[0])), @@ -430,8 +460,12 @@ struct Vectorized()>> { } template +<<<<<<< HEAD static Vectorized C10_ALWAYS_INLINE loadu_one_fourth(const U* ptr) { +======= + static Vectorized C10_ALWAYS_INLINE loadu_one_fourth(const U* ptr) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // load only first 8 bytes // only intended to be used with uint8_t return loadu(ptr, 8 / sizeof(ElementType)); @@ -439,7 +473,12 @@ struct Vectorized()>> { template struct StoreHelper { +<<<<<<< HEAD static void 
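// Hedged sketch of the partial-load pattern used by the zarch
// loadu(ptr, count) overloads above: copy count elements into a
// zero-initialized aligned scratch buffer, then do the full-width loads from
// the scratch, so nothing past ptr + count is read and tail lanes are
// well-defined zeros. loadu_partial_ref and the std::array return are
// scalar stand-ins for the two vec_xl loads.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstring>

template <typename T, int N>
std::array<T, N> loadu_partial_ref(const T* ptr, int count) {
  alignas(32) T tmp[N] = {};  // like the __at_align__ tmp_values buffer
  std::memcpy(tmp, ptr,
              static_cast<std::size_t>(std::min(count, N)) * sizeof(T));
  std::array<T, N> out{};
  std::copy(tmp, tmp + N, out.begin());  // stands in for vec_xl(offset0/offset16)
  return out;
}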
C10_ALWAYS_INLINE store(const Vectorized &vec, U* ptr, int count = size()) { +======= + static void C10_ALWAYS_INLINE + store(const Vectorized& vec, U* ptr, int count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count > 0) { __at_align__ ElementType tmp_values[size()]; vec_xst(vec._vec0, offset0, &(tmp_values[0])); @@ -452,7 +491,12 @@ struct Vectorized()>> { template struct StoreHelper { +<<<<<<< HEAD static void C10_ALWAYS_INLINE store(const Vectorized &vec, ElementType* ptr, int count = size()) { +======= + static void C10_ALWAYS_INLINE + store(const Vectorized& vec, ElementType* ptr, int count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (count == size()) { vec_xst(vec._vec0, offset0, ptr); vec_xst(vec._vec1, offset16, ptr); @@ -788,16 +832,24 @@ struct Vectorized()>> { return (*this <= other) & Vectorized((T)1.0); } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized C10_ALWAYS_INLINE abs() const { return {vec_abs(_vec0), vec_abs(_vec1)}; } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized C10_ALWAYS_INLINE abs() const { return {_vec0, _vec1}; } @@ -813,6 +865,7 @@ struct Vectorized()>> { } bool has_inf_nan() const { +<<<<<<< HEAD for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { return true; @@ -820,6 +873,15 @@ struct Vectorized()>> { } for (const auto i : c10::irange(size()/2)) { if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { +======= + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size() / 2)) { + if (_isnan(_vec1[i]) || _isinf(_vec1[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } @@ -900,9 +962,13 @@ struct Vectorized()>> { return sqrt().reciprocal(); } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized mapOrdinary(float (*const f)(float)) const { float a00 = f(_vec0[0]); float a01 = f(_vec0[1]); @@ -922,9 +988,13 @@ struct Vectorized()>> { return Vectorized(f(_vec0[0]), f(_vec0[1]), f(_vec1[0]), f(_vec1[1])); } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized mapOrdinary( float (*const f)(float, float), const Vectorized& b) const { @@ -1122,7 +1192,12 @@ struct Vectorized()>> { typename U = T, std::enable_if_t, int> = 0> Vectorized minimum(const Vectorized& other) const { +<<<<<<< HEAD Vectorized tmp = {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)}; +======= + Vectorized tmp = { + vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp = blendv(tmp, *this, isnan()); return blendv(tmp, other, other.isnan()); } @@ -1139,7 +1214,12 @@ struct Vectorized()>> { typename U = T, std::enable_if_t, int> = 0> Vectorized maximum(const Vectorized& other) const { +<<<<<<< HEAD Vectorized tmp = {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)}; +======= + Vectorized tmp = { + vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp = blendv(tmp, *this, isnan()); return blendv(tmp, other, other.isnan()); } @@ -1176,9 +1256,13 @@ struct Vectorized()>> { return blendv(tmp, *this, isnan()); } +<<<<<<< HEAD template < typename U = T, std::enable_if_t, int> = 0> +======= + template , int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized swapped() const { auto swap_mask = GetSwapMaskFloat(); vtype v0 = vec_perm(_vec0, _vec0, swap_mask); @@ -1260,6 +1344,7 @@ struct Vectorized()>> { std::enable_if_t, int> = 0> Vectorized to_vec_float_helper() const { int32_t values[8] = { +<<<<<<< HEAD _vec0[0], _vec0[1], _vec0[2], @@ -1274,6 +1359,27 @@ struct Vectorized()>> { values[0], values[1], values[2], values[3], values[4], values[5], values[6], values[7] }; +======= + _vec0[0], + _vec0[1], + _vec0[2], + _vec0[3], + _vec0[4], + _vec0[5], + _vec0[6], + _vec0[7], + }; + + return Vectorized{ + values[0], + values[1], + values[2], + values[3], + values[4], + values[5], + values[6], + values[7]}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template < @@ -1282,6 +1388,7 @@ struct Vectorized()>> { Vectorized to_vec_uint8_helper() const { // helper function for float to uint8_t conversion uint8_t values[8] = { +<<<<<<< HEAD static_cast(_vec0[0]), static_cast(_vec0[1]), static_cast(_vec0[2]), @@ -1301,10 +1408,30 @@ struct Vectorized()>> { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +======= + static_cast(_vec0[0]), + static_cast(_vec0[1]), + static_cast(_vec0[2]), + static_cast(_vec0[3]), + static_cast(_vec1[0]), + static_cast(_vec1[1]), + static_cast(_vec1[2]), + static_cast(_vec1[3]), + }; + + return Vectorized{ + values[0], values[1], values[2], values[3], values[4], values[5], + values[6], values[7], 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } }; +<<<<<<< HEAD #define ZVECTOR_OPERATORS(typex) \ template <> \ Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { \ @@ -1376,6 +1503,92 @@ struct Vectorized()>> { Vectorized C10_ALWAYS_INLINE operator<=(const Vectorized& a, const Vectorized& b) { \ return Vectorized{ \ vec_cmple(a.vec0(), b.vec0()), vec_cmple(a.vec1(), b.vec1())}; \ +======= +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() + b.vec0(), a.vec1() + b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() - b.vec0(), a.vec1() - b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator*( \ + const Vectorized& a, const 
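// Hedged sketch of the NaN handling in the zarch floating-point minimum /
// maximum above: the plain vec_min/vec_max result is patched with two blendv
// calls, and the second one (keyed on other.isnan()) runs last, so a NaN in
// 'other' takes precedence. minimum_ref is an illustrative scalar model.
#include <algorithm>
#include <cmath>

float minimum_ref(float a, float b) {  // a plays *this, b plays other
  float tmp = std::min(a, b);
  if (std::isnan(a)) tmp = a;  // blendv(tmp, *this, isnan())
  if (std::isnan(b)) tmp = b;  // blendv(tmp, other, other.isnan()), applied last
  return tmp;
}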
Vectorized& b) { \ + return Vectorized{a.vec0() * b.vec0(), a.vec1() * b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator/( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec0() / b.vec0(), a.vec1() / b.vec1()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() & b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() & b.vecb1())}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() | b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() | b.vecb1())}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + (Vectorized::vtype)(a.vecb0() ^ b.vecb0()), \ + (Vectorized::vtype)(a.vecb1() ^ b.vecb1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpeq(a.vec0(), b.vec0()), vec_cmpeq(a.vec1(), b.vec1())} \ + ._not(); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpgt(a.vec0(), b.vec0()), vec_cmpgt(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmpge(a.vec0(), b.vec0()), vec_cmpge(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmplt(a.vec0(), b.vec0()), vec_cmplt(a.vec1(), b.vec1())}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{ \ + vec_cmple(a.vec0(), b.vec0()), vec_cmple(a.vec1(), b.vec1())}; \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ZVECTOR_OPERATORS(float) @@ -1389,6 +1602,7 @@ ZVECTOR_OPERATORS(int64_t) #undef ZVECTOR_OPERATORS +<<<<<<< HEAD #define ZVECTOR_OPERATORS(typex) \ template <> \ Vectorized C10_ALWAYS_INLINE operator<<(const Vectorized& a, const Vectorized& b) { \ @@ -1442,6 +1656,67 @@ ZVECTOR_OPERATORS(int64_t) template <> \ inline Vectorized operator~(const Vectorized& a) { \ return a._not(); \ +======= +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator<<( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr Vectorized::ElementType max_shift = \ + sizeof(Vectorized::ElementType) * CHAR_BIT; \ + \ + Vectorized::ElementType a_array[Vectorized::size()]; \ + Vectorized::ElementType b_array[Vectorized::size()]; \ + Vectorized::ElementType c_array[Vectorized::size()]; \ + \ + a.store(a_array); \ + b.store(b_array); \ + \ + for (int i = 0; i != Vectorized::size(); i++) { \ + typex shift = b_array[i]; \ + if ((static_cast>(shift) < 0) || \ + (shift >= max_shift)) { \ + c_array[i] = 0; \ + } else { \ + c_array[i] = static_cast>(a_array[i]) \ + << shift; \ + } \ + } \ + \ + return Vectorized::loadu(c_array); \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator>>( \ + const Vectorized& a, const Vectorized& b) { \ + 
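// Hedged sketch of the intended per-lane semantics of the ZVECTOR
// operator<< / operator>> macros in this hunk: shift amounts that are
// negative or at least the element width do not wrap. Left shifts produce 0;
// right shifts produce the value shifted "all the way", i.e. sign-fill for
// signed element types and zero for unsigned ones. shl_ref/shr_ref are
// illustrative names and avoid the undefined full-width scalar shift.
#include <climits>
#include <type_traits>

template <typename T>
T shl_ref(T a, T b) {
  constexpr unsigned bits = sizeof(T) * CHAR_BIT;
  const auto shift = static_cast<std::make_unsigned_t<T>>(b);
  if (static_cast<std::make_signed_t<T>>(b) < 0 || shift >= bits) {
    return T(0);  // out-of-range left shift -> 0
  }
  return static_cast<T>(static_cast<std::make_unsigned_t<T>>(a) << shift);
}

template <typename T>
T shr_ref(T a, T b) {
  constexpr unsigned bits = sizeof(T) * CHAR_BIT;
  const auto shift = static_cast<std::make_unsigned_t<T>>(b);
  if (static_cast<std::make_signed_t<T>>(b) < 0 || shift >= bits) {
    // out-of-range right shift: sign-fill for signed, zero for unsigned
    if constexpr (std::is_signed_v<T>) {
      return a < 0 ? T(-1) : T(0);
    } else {
      return T(0);
    }
  }
  return static_cast<T>(a >> shift);
}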
/* right shift value to retain sign bit for signed and no bits for \ + * unsigned */ \ + constexpr Vectorized::ElementType max_shift = \ + sizeof(typex) * CHAR_BIT - std::is_signed_v; \ + \ + Vectorized::ElementType a_array[Vectorized::size()]; \ + Vectorized::ElementType b_array[Vectorized::size()]; \ + Vectorized::ElementType c_array[Vectorized::size()]; \ + \ + a.store(a_array); \ + b.store(b_array); \ + \ + for (int i = 0; i != Vectorized::size(); i++) { \ + typex shift = b_array[i]; \ + if ((static_cast>(shift) < 0) || \ + (shift >= max_shift)) { \ + c_array[i] = a_array[i] >> max_shift; \ + } else { \ + c_array[i] = a_array[i] >> shift; \ + } \ + } \ + \ + return Vectorized::loadu(c_array); \ + } \ + \ + template <> \ + inline Vectorized operator~(const Vectorized& a) { \ + return a._not(); \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ZVECTOR_OPERATORS(int8_t) @@ -1727,6 +2002,15 @@ C10_DIAGNOSTIC_POP() //////////////////////////////////QUANT/////////////////////////////////////////// template +<<<<<<< HEAD +======= +struct is_vec_specialized_for< + T, + std::enable_if_t()>> + : std::bool_constant {}; + +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized()>> { public: using value_type = typename T::underlying; @@ -1906,7 +2190,11 @@ struct Vectorized()>> { (vecf_0 - zero_point) * scale, (vecf_1 - zero_point) * scale, (vecf_2 - zero_point) * scale, +<<<<<<< HEAD (vecf_3 - zero_point) * scale }; +======= + (vecf_3 - zero_point) * scale}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template < @@ -2034,6 +2322,7 @@ struct Vectorized()>> { } }; +<<<<<<< HEAD #define ZVECTOR_OPERATORS(typex) \ template <> \ Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { \ @@ -2092,6 +2381,79 @@ struct Vectorized()>> { \ Vectorized C10_ALWAYS_INLINE operator<=(const Vectorized& a, const Vectorized& b) { \ return Vectorized{a.vec() <= b.vec()}; \ +======= +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() + b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() - b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator*( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() * b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator/( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() / b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() & b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() | b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() ^ b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() == b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const 
Vectorized& b) { \ + return Vectorized{a.vec() != b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() > b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() >= b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() < b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() <= b.vec()}; \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ZVECTOR_OPERATORS(c10::qint32) @@ -2185,6 +2547,15 @@ constexpr U log10e_inv() { } template +<<<<<<< HEAD +======= +struct is_vec_specialized_for< + T, + std::enable_if_t()>> + : std::bool_constant {}; + +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorized()>> { public: using underline_type = decltype(std::declval().imag()); @@ -2205,7 +2576,12 @@ struct Vectorized()>> { public: Vectorized() {} +<<<<<<< HEAD C10_ALWAYS_INLINE Vectorized(const vinner_data &v) : _vec{v.first, v.second} {} +======= + C10_ALWAYS_INLINE Vectorized(const vinner_data& v) + : _vec{v.first, v.second} {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template = 0> C10_ALWAYS_INLINE Vectorized(T s1, T s2) @@ -2406,10 +2782,17 @@ struct Vectorized()>> { template < typename U = T, std::enable_if_t>::value, int> = 0> +<<<<<<< HEAD static typename Vectorized::vinner_type real_neg(const typename Vectorized::vinner_type &a) { const auto swap_mask = ZSimdVectBinary{ 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31}; +======= + static typename Vectorized::vinner_type real_neg( + const typename Vectorized::vinner_type& a) { + const auto swap_mask = ZSimdVectBinary{ + 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto a_neg = a.neg(); vtype v0 = vec_perm(a_neg.vec0(), a.vec0(), swap_mask); @@ -2420,12 +2803,21 @@ struct Vectorized()>> { template < typename U = T, std::enable_if_t>::value, int> = 0> +<<<<<<< HEAD static typename Vectorized::vinner_type real_neg(const typename Vectorized::vinner_type &a) { auto a_neg = a.neg(); vtype v0 = {a_neg.vec0()[0], a.vec0()[1]}; vtype v1 = {a_neg.vec1()[0], a.vec1()[1]}; return { v0, v1 }; +======= + static typename Vectorized::vinner_type real_neg( + const typename Vectorized::vinner_type& a) { + auto a_neg = a.neg(); + vtype v0 = {a_neg.vec0()[0], a.vec0()[1]}; + vtype v1 = {a_neg.vec1()[0], a.vec1()[1]}; + return {v0, v1}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized angle2_() const { @@ -2516,15 +2908,27 @@ struct Vectorized()>> { } Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const { +<<<<<<< HEAD auto eq = _vec.eq(other._vec); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal +======= + auto eq = _vec.eq(other._vec); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the 
complex numbers + // are equal +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto real = eq & vinner_type(real_mask()); auto imag = (eq & vinner_type(image_mask())).swapped(); return Vectorized{real & imag}; } Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const { +<<<<<<< HEAD auto ne = _vec.ne(other._vec); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal +======= + auto ne = _vec.ne(other._vec); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto real = ne & vinner_type(real_mask()); auto imag = (ne & vinner_type(image_mask())).swapped(); return Vectorized{real | imag}; @@ -2551,8 +2955,12 @@ struct Vectorized()>> { return a.mergee().data(); } +<<<<<<< HEAD static T abs_helper(const T &value) { +======= + static T abs_helper(const T& value) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return T(std::abs(value)); } @@ -2633,6 +3041,7 @@ struct Vectorized()>> { } }; +<<<<<<< HEAD #define ZVECTOR_OPERATORS(typex) \ template <> \ Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& a, const Vectorized& b) { \ @@ -2721,6 +3130,114 @@ struct Vectorized()>> { \ Vectorized C10_ALWAYS_INLINE operator>=(const Vectorized& a, const Vectorized& b) { \ TORCH_CHECK(false, "not supported for complex numbers"); \ +======= +#define ZVECTOR_OPERATORS(typex) \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() + b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() - b.vec()}; \ + } \ + \ + template <> \ + Vectorized inline operator*( \ + const Vectorized& a, const Vectorized& b) { \ + /* (a + bi) * (c + di) = (ac - bd) + (ad + bc)i */ \ + Vectorized::vinner_type bv = b.vec(); \ + \ + /* this is more z arch friendly than simulating horizontal from x86 */ \ + Vectorized::vinner_type vi = bv.mergeo(); \ + Vectorized::vinner_type vr = bv.mergee(); \ + vi = vi ^ \ + Vectorized::vinner_type( \ + rsign_mask::underline_type>()); \ + Vectorized::vinner_type ret = a.vec() * vr; \ + Vectorized::vinner_type vx_swapped = a.vec().swapped(); \ + ret = fmadd(vx_swapped, vi, ret); \ + \ + return Vectorized{ret}; \ + } \ + \ + template <> \ + Vectorized inline operator/( \ + const Vectorized& a, const Vectorized& b) { \ + /* Unfortunately, this breaks some tests */ \ + /* Implement it like it's done for avx2 */ \ + auto fabs_cd = b.vec().abs(); /* |c| |d| */ \ + auto fabs_dc = fabs_cd.swapped(); /* |d| |c| */ \ + auto scale = Vectorized::vinner_type{1.0} / \ + maximum(fabs_cd, fabs_dc); /* 1/sc 1/sc */ \ + auto a2 = a.vec() * scale; /* a/sc b/sc */ \ + auto b2 = b.vec() * scale; /* c/sc d/sc */ \ + auto acbd2 = a2 * b2; /* ac/sc^2 bd/sc^2 */ \ + \ + auto dc2 = b2.swapped(); /* d/sc c/sc */ \ + dc2 = Vectorized::real_neg(dc2); /* -d/|c,d| c/sc */ \ + auto adbc2 = a2 * dc2; /* -ad/sc^2 bc/sc^2 */ \ + auto sum1 = acbd2 + acbd2.swapped(); /* (ac+bd)/sc^2 (ac+bd)/sc^2 */ \ + auto sum2 = adbc2 + adbc2.swapped(); /* (bc-ad)/sc^2 (bc-ad)/sc^2 */ \ + auto res2 = 
Vectorized::vinner_type::mergee( \ + sum1, sum2); /* (ac+bd)/sc^2 (bc-ad)/sc^2 */ \ + \ + /* get the denominator */ \ + Vectorized::vinner_type denom2 = \ + Vectorized{b2}.abs_2_(); /* (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 */ \ + res2 = res2 / denom2; \ + return Vectorized{res2}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() & b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() | b.vec()}; \ + } \ + \ + template <> \ + Vectorized C10_ALWAYS_INLINE operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() ^ b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator==( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() == b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator!=( \ + const Vectorized& a, const Vectorized& b) { \ + return Vectorized{a.vec() != b.vec()}; \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator<=( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ + } \ + \ + Vectorized C10_ALWAYS_INLINE operator>=( \ + const Vectorized& a, const Vectorized& b) { \ + TORCH_CHECK(false, "not supported for complex numbers"); \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } ZVECTOR_OPERATORS(c10::complex) @@ -2872,28 +3389,55 @@ std::pair, Vectorized> inline deinterleave2< } template +<<<<<<< HEAD std::enable_if_t, at::vec::Vectorized> inline convert_int8_to_float(const Vectorized &src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() // Only handle first 64 bits +======= +std::enable_if_t< + std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(const Vectorized& src) { + // Note: this function only convert inputs number of elements equal to + // at::vec::Vectorized.size() Only handle first 64 bits +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto vec_int = src.to_vec_float_helper(); return zvec_convert_to_float(vec_int); } template +<<<<<<< HEAD std::enable_if_t, at::vec::Vectorized> inline convert_float_to_int8(const Vectorized &src) { constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); auto vec_int = clamp(zvec_convert_to_int(src), Vectorized(min_val), Vectorized(max_val)); +======= +std::enable_if_t< + std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(const Vectorized& src) { + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); + + auto vec_int = clamp( + zvec_convert_to_int(src), + Vectorized(min_val), + Vectorized(max_val)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return vec_int.to_vec_uint8_helper(); } #undef DEFINE_CLAMP_MAXMIN_FUNCS #undef DEFINE_MAXMIN_FUNCS +<<<<<<< HEAD } // 
namespace +======= +} // namespace CPU_CAPABILITY +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index d593d184c319..091ad39d4eb3 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -5,9 +5,17 @@ #include +<<<<<<< HEAD #include #include #include +======= +// clang-format off +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -15,6 +23,10 @@ #include #include #include +<<<<<<< HEAD +======= +// clang-format on +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -56,34 +68,58 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { return stream; } +<<<<<<< HEAD #if defined(CPU_CAPABILITY_AVX512) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template<> +======= +#if defined(CPU_CAPABILITY_AVX512) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm512_castpd_ps(src); } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm512_castps_pd(src); } +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return _mm512_castsi512_ps(src); } +<<<<<<< HEAD template<> inline Vectorized cast(const Vectorized& src) { +======= +template <> +inline Vectorized cast( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_castsi512_pd(src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #ifndef _MSC_VER // MSVC is not working well on complex function overload. 
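Editor's aside (illustration, not part of the patch): the AVX-512 gather specializations that follow load each lane from base_addr at a byte offset of vindex[i] * scale, which is why only scale values of 1, 2, 4, or 8 are accepted. A minimal scalar sketch of that addressing, using a hypothetical helper name and assuming scale is a byte multiplier as with _mm512_i64gather_pd:

#include <cstdint>

// Scalar reference for the gather semantics: one element per lane,
// addressed as base_addr + vindex[i] * scale bytes.
template <int64_t scale>
void gather_ref(double* out, const double* base_addr,
                const int64_t* vindex, int lanes) {
  const char* base = reinterpret_cast<const char*>(base_addr);
  for (int i = 0; i < lanes; ++i) {
    out[i] = *reinterpret_cast<const double*>(base + vindex[i] * scale);
  }
}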
+<<<<<<< HEAD template std::enable_if_t> inline gather(const double* base_addr, const Vectorized& vindex) { @@ -93,25 +129,60 @@ inline gather(const double* base_addr, const Vectorized& vindex) { template std::enable_if_t> inline gather(const float* base_addr, const Vectorized& vindex) { +======= +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + double>> inline gather(const double* base_addr, const Vectorized& vindex) { + return _mm512_i64gather_pd(vindex, base_addr, scale); +} + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + float>> inline gather(const float* base_addr, const Vectorized& vindex) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_i32gather_ps(vindex, base_addr, scale); } #endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #ifndef _MSC_VER // MSVC is not working well on complex function overload. +<<<<<<< HEAD template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, const Vectorized& vindex, Vectorized& mask) { +======= +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const double* base_addr, + const Vectorized& vindex, + Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto all_ones = _mm512_castsi512_pd(_mm512_set1_epi64(0xFFFFFFFFFFFFFFFF)); auto mask_ = _mm512_cmp_pd_mask(all_ones, mask.values, _CMP_EQ_OQ); return _mm512_mask_i64gather_pd(src, mask_, vindex, base_addr, scale); } +<<<<<<< HEAD template std::enable_if_t> inline mask_gather(const Vectorized& src, const float* base_addr, const Vectorized& vindex, Vectorized& mask) { +======= +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + const float* base_addr, + const Vectorized& vindex, + Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto all_ones = _mm512_castsi512_ps(_mm512_set1_epi32(0xFFFFFFFF)); auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); @@ -119,6 +190,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, #endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> Vectorized inline convert_to_int_of_same_size(const Vectorized &src) { @@ -140,14 +212,43 @@ inline convert_to_fp_of_same_size(const Vectorized &src) { template<> Vectorized inline convert_to_fp_of_same_size(const Vectorized &src) { +======= +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + return _mm512_cvtpd_epi64(src); +} + +template <> +Vectorized inline convert_to_int_of_same_size( + const Vectorized& src) { + return _mm512_cvttps_epi32(src); +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { + return _mm512_cvtepi64_pd(src); +} + +template <> +Vectorized inline convert_to_fp_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_cvtepi32_ps(src); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, 
Vectorized> inline interleave2(const Vectorized& a, const Vectorized& b) { +======= +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, a1, a3, a3, a4, a5, a6, a7} // b = {b0, b1, b2, b3, b4, b5, b6, b7} @@ -156,6 +257,7 @@ inline interleave2(const Vectorized& a, const Vectorized // {a4, b4, a5, b5, a6, b6, a7, b7} __m512i idx1 = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0); __m512i idx2 = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4); +<<<<<<< HEAD return std::make_pair(_mm512_mask_permutex2var_pd(a, 0xff, idx1, b), _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); } @@ -176,19 +278,53 @@ inline interleave2(const Vectorized& a, const Vectorized& b 27, 11, 26, 10, 25, 9, 24, 8); return std::make_pair(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +======= + return std::make_pair( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); +} + +template <> +std::pair, Vectorized> inline interleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, + // a15} b = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, + // b14, b15} + // + // return: + // {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} + // {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, + // b15} + __m512i idx1 = + _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + return std::make_pair( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> +<<<<<<< HEAD std::pair, Vectorized> inline deinterleave2(const Vectorized& a, const Vectorized& b) { +======= +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inputs: // a = {a0, b0, a1, b1, a2, b2, a3, b3} // b = {a4, b4, a5, b5, a6, b6, a7, b7} // output: // return {a0, a1, a2, a3, a4, a5, a6, a7} // {b0, b1, b2, b3, b4, b5, b6, b7} +<<<<<<< HEAD // The members of indices have been written in binary format for better understandability __m512i idx1 = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0); __m512i idx2 = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1); @@ -213,10 +349,44 @@ inline deinterleave2(const Vectorized& a, const Vectorized& return std::make_pair(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +======= + // The members of indices have been written in binary format for better + // understandability + __m512i idx1 = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx2 = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1); + + return std::make_pair( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); +} + +template <> +std::pair, Vectorized> inline deinterleave2( + const Vectorized& a, + const Vectorized& b) { + // inputs: + 
// a = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} + // b = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, + // a15, b15} + // output: + // return {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, + // a15} + // {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, + // b15} + __m512i idx1 = _mm512_set_epi32( + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + return std::make_pair( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, @@ -226,16 +396,33 @@ inline Vectorized flip(const Vectorized & v) { template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm512_permutexvar_ps(mask, v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); return _mm512_permutexvar_pd(mask, v); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); return _mm512_permutexvar_epi64(mask, v); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, @@ -259,11 +446,125 @@ inline __m512i flip8(const __m512i & v) { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); +======= +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = + _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return _mm512_permutexvar_epi32(mask, v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { + const __m512i mask = _mm512_set_epi16( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31); + return _mm512_permutexvar_epi16(mask, v); +} + +inline __m512i flip8(const __m512i& v) { + const __m512i mask1 = _mm512_set_epi8( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m512i mask2 = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); auto reversed_vec = _mm512_shuffle_epi8(v, mask1); return 
_mm512_permutexvar_epi64(mask2, reversed_vec); } +<<<<<<< HEAD template<> inline Vectorized flip(const Vectorized & v) { return flip8(v); @@ -271,6 +572,15 @@ inline Vectorized flip(const Vectorized & v) { template<> inline Vectorized flip(const Vectorized & v) { +======= +template <> +inline Vectorized flip(const Vectorized& v) { + return flip8(v); +} + +template <> +inline Vectorized flip(const Vectorized& v) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return flip8(v); } @@ -288,4 +598,10 @@ inline Vectorized operator&&( #endif // defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD }}} +======= +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index f116929f8b08..7ddca47f000a 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -12,7 +12,10 @@ #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { @@ -85,7 +88,12 @@ static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) { t_lo = _mm512_mask_blend_epi32(mask_lo, nan, t_lo); t_hi = _mm512_mask_blend_epi32(mask_hi, nan, t_hi); +<<<<<<< HEAD t_lo = _mm512_packus_epi32(t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] +======= + t_lo = _mm512_packus_epi32( + t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0); return _mm512_permutexvar_epi64(idx, t_lo); } @@ -113,6 +121,7 @@ static inline void cvtfp16_fp32(const __m512i& a, __m512& o1, __m512& o2) { } static inline __m256i cvtfp32_fp16(const __m512& src) { +<<<<<<< HEAD return _mm512_cvtps_ph( src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); } @@ -122,12 +131,23 @@ static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) { a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); __m256i hi = _mm512_cvtps_ph( b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm512_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + +static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) { + __m256i lo = + _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + __m256i hi = + _mm512_cvtps_ph(b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 t_lo = _mm512_castsi512_ps(_mm512_castsi256_si512(lo)); __m256 t_hi = _mm256_castsi256_ps(hi); return _mm512_castps_si512(_mm512_insertf32x8(t_lo, t_hi, 1)); } // dtype conversion between float16/bfloat16 and float32 +<<<<<<< HEAD template , int> = 0> inline void cvt_to_fp32(const __m256i& a, __m512& o); template <> inline void cvt_to_fp32(const __m256i& a, __m512& o) { @@ -159,17 +179,77 @@ template <> inline __m512i cvt_from_fp32(const __m512& a, const __m return cvtfp32_fp16(a, b); } template <> inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { 
+======= +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m256i& a, __m512& o); +template <> +inline void cvt_to_fp32(const __m256i& a, __m512& o) { + cvtbf16_fp32(a, o); +} +template <> +inline void cvt_to_fp32(const __m256i& a, __m512& o) { + cvtfp16_fp32(a, o); +} + +template < + typename T, + typename std::enable_if_t, int> = 0> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2); +template <> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2) { + cvtbf16_fp32(a, o1, o2); +} +template <> +inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2) { + cvtfp16_fp32(a, o1, o2); +} + +template < + typename T, + bool is_compare_op = false, + typename std::enable_if_t, int> = 0> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b); +template <> +inline __m512i cvt_from_fp32( + const __m512& a, + const __m512& b) { + return cvtfp32_bf16(a, b); +} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { + return merge_compare_result(a, b); +} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { + return cvtfp32_fp16(a, b); +} +template <> +inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvtfp32_fp16(a, b); } template class Vectorized16 { +<<<<<<< HEAD static_assert( is_reduced_floating_point_v, "Support only float16 and bfloat16."); private: __m512i values; public: +======= + static_assert( + is_reduced_floating_point_v, + "Support only float16 and bfloat16."); + + private: + __m512i values; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = uint16_t; using size_type = int; static constexpr size_type size() { @@ -181,6 +261,7 @@ static_assert( value_type uw = val.x; values = _mm512_set1_epi16(uw); } +<<<<<<< HEAD Vectorized16(T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8, T val9, T val10, T val11, T val12, @@ -194,14 +275,89 @@ static_assert( val24.x, val23.x, val22.x, val21.x, val20.x, val19.x, val18.x, val17.x, val16.x, val15.x, val14.x, val13.x, val12.x, val11.x, val10.x, val9.x, val8.x, val7.x, val6.x, val5.x, val4.x, val3.x, val2.x, val1.x); +======= + Vectorized16( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32) { + values = _mm512_set_epi16( + val32.x, + val31.x, + val30.x, + val29.x, + val28.x, + val27.x, + val26.x, + val25.x, + val24.x, + val23.x, + val22.x, + val21.x, + val20.x, + val19.x, + val18.x, + val17.x, + val16.x, + val15.x, + val14.x, + val13.x, + val12.x, + val11.x, + val10.x, + val9.x, + val8.x, + val7.x, + val6.x, + val5.x, + val4.x, + val3.x, + val2.x, + val1.x); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m512i() const { return values; } T& operator[](int idx) = delete; +<<<<<<< HEAD const T& operator[](int idx) const = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + 
const T& operator[](int idx) const = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_cmpeq_epi16_mask(values, _mm512_set1_epi16(0)); } static Vectorized loadu(const void* ptr, int16_t count = size()) { @@ -223,12 +379,20 @@ static_assert( static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm512_mask_blend_epi16(mask, a.values, b.values); } +<<<<<<< HEAD static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto all_ones = _mm512_set1_epi16(0xFFFF); auto mask_ = _mm512_cmp_epi16_mask(mask, all_ones, _MM_CMPINT_EQ); return _mm512_mask_blend_epi16(mask_, a.values, b.values); } +<<<<<<< HEAD template static Vectorized arange(T base = 0.f, step_t step = static_cast(1)) { return Vectorized( @@ -243,6 +407,50 @@ static_assert( } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + template + static Vectorized arange( + T base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -311,8 +519,13 @@ static_assert( } return b; } +<<<<<<< HEAD #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wignored-qualifiers" +======= +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wignored-qualifiers" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { __m512 lo, hi; @@ -328,12 +541,23 @@ static_assert( __m512 zero = _mm512_set1_ps(0.0); __m512i zeroi = _mm512_castps_si512(zero); lo_mask = _mm512_cmp_ps_mask(lo, zero, _CMP_UNORD_Q); +<<<<<<< HEAD lo = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zeroi, lo_mask, 0xFFFF'FFFF)); hi_mask = _mm512_cmp_ps_mask(hi, zero, _CMP_UNORD_Q); hi = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zeroi, hi_mask, 0xFFFF'FFFF)); return merge_compare_result(lo, hi); } #pragma clang diagnostic pop +======= + lo = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zeroi, lo_mask, 0xFFFF'FFFF)); + hi_mask = _mm512_cmp_ps_mask(hi, zero, _CMP_UNORD_Q); + hi = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zeroi, hi_mask, 0xFFFF'FFFF)); + return 
merge_compare_result(lo, hi); + } +#pragma clang diagnostic pop +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm512_andnot_si512(_mm512_set1_epi16(0x8000), values); } @@ -344,10 +568,17 @@ static_assert( const auto zero_vec = _mm512_set1_ps(0.f); const auto nan_vec = _mm512_set1_ps(NAN); const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ); +<<<<<<< HEAD const auto non_nan_mask_vec = _mm512_mask_set1_epi32(_mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); const auto nan_mask = _mm512_cmp_ps_mask(_mm512_castsi512_ps(non_nan_mask_vec), zero_vec, _CMP_EQ_OQ); +======= + const auto non_nan_mask_vec = _mm512_mask_set1_epi32( + _mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); + const auto nan_mask = _mm512_cmp_ps_mask( + _mm512_castsi512_ps(non_nan_mask_vec), zero_vec, _CMP_EQ_OQ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto pi = _mm512_set1_ps(c10::pi); const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ); @@ -386,7 +617,11 @@ static_assert( Vectorized atanh() const { return map(Sleef_atanhf16_u10); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { +======= + Vectorized atan2(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 b1, b2; cvt_to_fp32(values, lo, hi); @@ -395,12 +630,20 @@ static_assert( auto o2 = Sleef_atan2f16_u10(hi, b2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized copysign(const Vectorized &sign) const { // copy sign bit (0x8000) from sign and remaining bits from values __m512i mask_value = _mm512_set1_epi32(~0x80008000); __m512i mask_signbit = _mm512_set1_epi32(0x80008000); return Vectorized( _mm512_or_si512( +======= + Vectorized copysign(const Vectorized& sign) const { + // copy sign bit (0x8000) from sign and remaining bits from values + __m512i mask_value = _mm512_set1_epi32(~0x80008000); + __m512i mask_signbit = _mm512_set1_epi32(0x80008000); + return Vectorized(_mm512_or_si512( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_and_si512(values, mask_value), _mm512_and_si512(sign, mask_signbit))); } @@ -436,7 +679,11 @@ static_assert( Vectorized exp_u20() const { return exp(); } +<<<<<<< HEAD Vectorized fmod(const Vectorized & q) const { +======= + Vectorized fmod(const Vectorized& q) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 x_lo, x_hi; cvt_to_fp32(values, x_lo, x_hi); __m512 q_lo, q_hi; @@ -445,7 +692,11 @@ static_assert( auto o2 = Sleef_fmodf16(x_hi, q_hi); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 b1, b2; cvt_to_fp32(values, lo, hi); @@ -500,7 +751,11 @@ static_assert( const auto o2 = _mm512_loadu_ps(tmp2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel 
for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 xlo, xhi; cvt_to_fp32(values, lo, hi); @@ -520,7 +775,11 @@ static_assert( return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 xlo, xhi; cvt_to_fp32(values, lo, hi); @@ -583,8 +842,15 @@ static_assert( Vectorized round() const { __m512 lo, hi; cvt_to_fp32(values, lo, hi); +<<<<<<< HEAD auto o1 = _mm512_roundscale_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); auto o2 = _mm512_roundscale_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + auto o1 = _mm512_roundscale_ps( + lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + auto o2 = _mm512_roundscale_ps( + hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvt_from_fp32(o1, o2); } Vectorized tan() const { @@ -596,8 +862,15 @@ static_assert( Vectorized trunc() const { __m512 lo, hi; cvt_to_fp32(values, lo, hi); +<<<<<<< HEAD auto o1 = _mm512_roundscale_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); auto o2 = _mm512_roundscale_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + auto o1 = + _mm512_roundscale_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); + auto o2 = + _mm512_roundscale_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return cvt_from_fp32(o1, o2); } Vectorized lgamma() const { @@ -626,7 +899,11 @@ static_assert( auto o2 = _mm512_div_ps(ones, _mm512_sqrt_ps(hi)); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 lo, hi; __m512 b1, b2; cvt_to_fp32(values, lo, hi); @@ -635,8 +912,14 @@ static_assert( auto o2 = Sleef_powf16_u10(hi, b2); return cvt_from_fp32(o1, o2); } +<<<<<<< HEAD private: template +======= + + private: + template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline binary_compare(const VectorizedType& b, Op op) const { __m512 a_lo, a_hi; __m512 b_lo, b_hi; @@ -644,56 +927,101 @@ static_assert( cvt_to_fp32(b.values, b_lo, b_hi); auto o1 = op(a_lo, b_lo); auto o2 = op(a_hi, b_hi); +<<<<<<< HEAD return cvt_from_fp32(o1, o2); } public: +======= + return cvt_from_fp32(o1, o2); + } + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized inline operator>(const Vectorized& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator<(const Vectorized& other) 
const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator>=(const Vectorized& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator<=(const Vectorized& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator==(const Vectorized16& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } Vectorized inline operator!=(const Vectorized16& other) const { return binary_compare(other, [](__m512 x, __m512 y) { auto zero_vec = _mm512_set1_epi32(0); auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); } }; +<<<<<<< HEAD template static inline Vectorized binary_op_as_fp32(const Vectorized& a, const Vectorized& b, Op op) { +======= +template +static inline Vectorized binary_op_as_fp32( + const Vectorized& a, + const Vectorized& b, + Op op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; cvt_to_fp32(__m512i(a), a_lo, a_hi); @@ -704,8 +1032,16 @@ static inline Vectorized binary_op_as_fp32(const Vectorized& a, const Vect } template <> +<<<<<<< HEAD class Vectorized: public Vectorized16 { public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized16::Vectorized16; using value_type = BFloat16; @@ -720,6 +1056,7 @@ class Vectorized: public 
Vectorized16 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_add_ps(x, y); }); } @@ -763,6 +1100,79 @@ inline Vectorized Vectorized::lt(const Vectorized& } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_div_ps(x, y); + }); +} +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_si512(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm512_or_si512(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_si512(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -774,7 +1184,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; cvtbf16_fp32(__m512i(a), a_lo, a_hi); @@ -794,7 +1210,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectori // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. 
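Editor's aside (illustration, not part of the patch): the surrounding maximum/minimum specializations both follow the "propagates NaN if either input is a NaN" note above; a scalar sketch of that semantic, distinct from std::max/std::min, which would instead ignore a NaN in the first operand:

#include <cmath>
#include <limits>

// NaN in either operand yields NaN, matching the IEEE 754-201x style
// comments around these vectorized kernels.
static inline float maximum_ref(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<float>::quiet_NaN();
  return a > b ? a : b;
}
static inline float minimum_ref(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<float>::quiet_NaN();
  return a < b ? a : b;
}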
template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; __m512i zero_vec = _mm512_set1_epi32(0); @@ -804,10 +1226,17 @@ Vectorized inline minimum(const Vectorized& a, const Vectori auto min_hi = _mm512_min_ps(a_hi, b_hi); auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q); auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q); +<<<<<<< HEAD auto nan_lo = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, nan_lo_mask, 0xFFFFFFFF)); auto nan_hi = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, nan_hi_mask, 0xFFFFFFFF)); +======= + auto nan_lo = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, nan_lo_mask, 0xFFFFFFFF)); + auto nan_hi = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, nan_hi_mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. auto o1 = _mm512_or_ps(min_lo, nan_lo); auto o2 = _mm512_or_ps(min_hi, nan_hi); @@ -815,8 +1244,15 @@ Vectorized inline minimum(const Vectorized& a, const Vectori } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 min_lo, min_hi; __m512 max_lo, max_hi; @@ -829,7 +1265,13 @@ Vectorized inline clamp(const Vectorized& a, } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 max_lo, max_hi; cvtbf16_fp32(__m512i(a), a_lo, a_hi); @@ -840,7 +1282,13 @@ Vectorized inline clamp_max(const Vectorized& a, const Vecto } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 min_lo, min_hi; cvtbf16_fp32(__m512i(a), a_lo, a_hi); @@ -856,8 +1304,15 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc); } #ifndef __msvc_cl__ @@ -871,7 +1326,12 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { template <> inline void 
convert(const float* src, BFloat16* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a = _mm512_loadu_ps(&src[i]); __m512 b = _mm512_loadu_ps(&src[i + 16]); @@ -885,7 +1345,11 @@ inline void convert(const float* src, BFloat16* dst, int64_t n) { template <> inline void convert(const double* src, BFloat16* dst, int64_t n) { +<<<<<<< HEAD auto load_float = [](const double *src) -> __m512 { +======= + auto load_float = [](const double* src) -> __m512 { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Load one float vector from an array of doubles __m256 a = _mm512_cvtpd_ps(_mm512_loadu_pd(src)); __m256 b = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8)); @@ -893,7 +1357,12 @@ inline void convert(const double* src, BFloat16* dst, int64_t n) { }; int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a = load_float(&src[i]); __m512 b = load_float(&src[i + 16]); @@ -906,8 +1375,15 @@ inline void convert(const double* src, BFloat16* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; __m512 c_lo, c_hi; @@ -921,6 +1397,7 @@ Vectorized inline fmadd(const Vectorized& a, static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { __m512i r[8]; +<<<<<<< HEAD // a0a1 a2a3 a4a5 a6a7 a8a9 a10a11 a12a13 a14a15 e0e1 e2e3 e4e5 e6e7 e8e9 e10e11 e12e13 e14e15 // b0-b15 f0-f15 // c0-c15 g0-g15 @@ -929,11 +1406,17 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { // j0-j15 n0-n15 // k0-k15 o0-o15 // l0-l15 p0-p15 +======= + // a0a1 a2a3 a4a5 a6a7 a8a9 a10a11 a12a13 a14a15 e0e1 e2e3 e4e5 e6e7 e8e9 + // e10e11 e12e13 e14e15 b0-b15 f0-f15 c0-c15 g0-g15 d0-d15 h0-h15 i0-i15 + // m0-m15 j0-j15 n0-n15 k0-k15 o0-o15 l0-l15 p0-p15 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(4) #endif for (int i = 0; i < 4; i++) { r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01); +<<<<<<< HEAD r[i + 4] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01); } @@ -943,6 +1426,18 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { // u3: c4c5 d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15 // i j m n // k l o p +======= + r[i + 4] = + _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01); + } + + // u0: a0a1 b0b1 a2a3 b2b3 a8a9 b8b9 a10a11 b10b11 e0e1 f0f1 e2e3 f2f3 e8e9 + // f8f9 e10e11 f10f11 u1: a4a5 b4b5 a6a7 b6b7 a12a13 b12b13 a14a15 b14b15 e4e5 + // f4f5 e6e7 f6f7 e12e13 f12f13 e14e15 f14f15 u2: c0c1 d0d1 c2c3 d2d3 
c8c9 + // d8d9 c10c11 d10d11 g0g1 h0h1 g2g3 h2h3 g8g9 h8h9 g10g11 h10h11 u3: c4c5 + // d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 + // h12h13 g14g15 h14h15 i j m n k l o p +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(4) #endif @@ -951,11 +1446,19 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]); } +<<<<<<< HEAD // r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9 e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 g8g9 h8h9 // r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11 e2e3 f2f3 g2g3 h2h3 e10e11 f10f11 g10g11 h10h11 // r2: a4a5 b4b5 c4c5 d4b5 a12a13 b12b13 c12c13 d12d13 // r3: a6a7 b6b7 c6c7 d6b7 a14a15 b14b15 c14c15 d14d15 // r4: i j k l m n o p +======= + // r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9 e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 + // g8g9 h8h9 r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11 e2e3 f2f3 + // g2g3 h2h3 e10e11 f10f11 g10g11 h10h11 r2: a4a5 b4b5 c4c5 d4b5 a12a13 b12b13 + // c12c13 d12d13 r3: a6a7 b6b7 c6c7 d6b7 a14a15 b14b15 c14c15 d14d15 r4: i j k + // l m n o p +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) r[0] = _mm512_unpacklo_epi64(u[0], u[2]); r[1] = _mm512_unpackhi_epi64(u[0], u[2]); r[2] = _mm512_unpacklo_epi64(u[1], u[3]); @@ -1020,7 +1523,11 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { // TODO(Leslie): Add the AVX2 Version of transpose_mxn for BFloat16 and Float16 // Code referred to FBGEMM: // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void transpose_mxn( const BFloat16* src, int64_t ld_src, @@ -1048,7 +1555,12 @@ inline void transpose_mxn( #pragma unroll(16) #endif for (int i = 0; i < 16; i++) { +<<<<<<< HEAD t[i] = _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); +======= + t[i] = + _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } __m512i u[8]; @@ -1059,8 +1571,13 @@ inline void transpose_mxn( #endif for (int i = 0; i < 8; i++) { _mm256_storeu_si256( +<<<<<<< HEAD reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), _mm512_extracti32x8_epi32(u[i], 0x0)); +======= + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_si256( reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), _mm512_extracti32x8_epi32(u[i], 0x01)); @@ -1069,7 +1586,11 @@ inline void transpose_mxn( // Code referred to FBGEMM: // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +<<<<<<< HEAD template<> +======= +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline void transpose_mxn( const Half* src, int64_t ld_src, @@ -1082,7 +1603,12 @@ inline void transpose_mxn( #pragma unroll(16) #endif for (int i = 0; i < 16; i++) { +<<<<<<< HEAD t[i] = 
_mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); +======= + t[i] = + _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } __m512i u[8]; @@ -1093,8 +1619,13 @@ inline void transpose_mxn( #endif for (int i = 0; i < 8; i++) { _mm256_storeu_si256( +<<<<<<< HEAD reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), _mm512_extracti32x8_epi32(u[i], 0x0)); +======= + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm256_storeu_si256( reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), _mm512_extracti32x8_epi32(u[i], 0x01)); @@ -1106,6 +1637,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[1]: 4 36 5 37 6 38 7 39 12 44 13 45 14 46 15 47 20 ... 63 // t[2]: 64 96 65 97 66 98 67 99 72 104 73 105 74 106 75 ... 123 // t[3]: 68 100 69 101 70 102 71 103 76 108 77 109 78 110 79 111 84 ... 127 +<<<<<<< HEAD // t[4]: 128 160 129 161 130 162 131 163 136 168 137 169 138 170 139 171 144 ... 187 // t[5]: 132 164 133 165 134 166 135 167 140 172 141 173 142 174 143 175 148 ... 191 // t[6]: 192 224 193 225 194 226 195 227 200 232 201 233 202 234 203 235 208 ... 251 @@ -1121,6 +1653,26 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 512 544 513 545 514 546 515 547 520 552 521 553 522 554 523 555 528 ... 571 // ... // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 980 ... 1023 +======= + // t[4]: 128 160 129 161 130 162 131 163 136 168 137 169 138 170 139 171 144 + // ... 187 t[5]: 132 164 133 165 134 166 135 167 140 172 141 173 142 174 143 + // 175 148 ... 191 t[6]: 192 224 193 225 194 226 195 227 200 232 201 233 202 + // 234 203 235 208 ... 251 t[7]: 196 228 197 229 198 230 199 231 204 236 205 + // 237 206 238 207 239 212 ... 255 t[8]: 256 288 257 289 258 290 259 291 264 + // 296 265 297 266 298 267 299 272 ... 315 t[9]: 260 292 261 293 262 294 263 + // 295 268 300 269 301 270 302 271 303 276 ... 319 t[10]: 320 352 321 353 322 + // 354 323 355 328 360 329 361 330 362 331 363 336 ... 379 t[11]: 324 356 325 + // 357 326 358 327 359 332 364 333 365 334 366 335 367 340 ... 383 t[12]: 384 + // 416 385 417 386 418 387 419 392 424 393 425 394 426 395 427 400 ... 443 + // t[13]: 388 420 389 421 390 422 391 423 396 428 397 429 398 430 399 431 404 + // ... 447 t[14]: 448 480 449 481 450 482 451 483 456 488 457 489 458 490 459 + // 491 464 ... 507 t[15]: 452 484 453 485 454 486 455 487 460 492 461 493 462 + // 494 463 495 468 ... 511 t[16]: 512 544 513 545 514 546 515 547 520 552 521 + // 553 522 554 523 555 528 ... 571 + // ... + // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 + // 980 ... 1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(16) #endif @@ -1133,6 +1685,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[1]: 2 34 66 98 3 35 67 99 10 42 74 106 11 43 75 107 18 ... 123 // t[2]: 4 36 68 100 5 37 69 101 12 44 76 108 13 45 77 109 20 ... 125 // t[3]: 6 38 70 102 7 39 71 103 14 46 78 110 15 47 79 111 22 ... 127 +<<<<<<< HEAD // t[4]: 128 160 192 224 129 161 193 225 136 168 200 232 137 169 201 233 144 ... 
249 // t[5]: 130 162 194 226 131 163 195 227 138 170 202 234 139 171 203 235 146 ... 251 // t[6]: 132 164 196 228 133 165 197 229 140 172 204 236 141 173 205 237 148 ... 253 @@ -1148,6 +1701,26 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 512 544 576 608 513 545 577 609 520 552 584 616 521 553 585 617 528 ... 633 // ... // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 918 ... 1023 +======= + // t[4]: 128 160 192 224 129 161 193 225 136 168 200 232 137 169 201 233 144 + // ... 249 t[5]: 130 162 194 226 131 163 195 227 138 170 202 234 139 171 203 + // 235 146 ... 251 t[6]: 132 164 196 228 133 165 197 229 140 172 204 236 141 + // 173 205 237 148 ... 253 t[7]: 134 166 198 230 135 167 199 231 142 174 206 + // 238 143 175 207 239 150 ... 255 t[8]: 256 288 320 352 257 289 321 353 264 + // 296 328 360 265 297 329 361 272 ... 377 t[9]: 258 290 322 354 259 291 323 + // 355 266 298 330 362 267 299 331 363 274 ... 379 t[10]: 260 292 324 356 261 + // 293 325 357 268 300 332 364 269 301 333 365 276 ... 381 t[11]: 262 294 326 + // 358 263 295 327 359 270 302 334 366 271 303 335 367 278 ... 383 t[12]: 384 + // 416 448 480 385 417 449 481 392 424 456 488 393 425 457 489 400 ... 505 + // t[13]: 386 418 450 482 387 419 451 483 394 426 458 490 395 427 459 491 402 + // ... 507 t[14]: 388 420 452 484 389 421 453 485 396 428 460 492 397 429 461 + // 493 404 ... 509 t[15]: 390 422 454 486 391 423 455 487 398 430 462 494 399 + // 431 463 495 406 ... 511 t[16]: 512 544 576 608 513 545 577 609 520 552 584 + // 616 521 553 585 617 528 ... 633 + // ... + // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 + // 918 ... 1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(8) #endif @@ -1166,6 +1739,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[5]: 5 37 69 101 133 165 197 229 13 45 77 109 141 173 205 237 21 ... 253 // t[6]: 6 38 70 102 134 166 198 230 14 46 78 110 142 174 206 238 22 ... 254 // t[7]: 7 39 71 103 135 167 199 231 15 47 79 111 143 175 207 239 23 ... 255 +<<<<<<< HEAD // t[8]: 256 288 320 352 384 416 448 480 264 296 328 360 392 424 456 488 272 ... 504 // t[9]: 257 289 321 353 385 417 449 481 265 297 329 361 393 425 457 489 273 ... 505 // t[10]: 258 290 322 354 386 418 450 482 266 298 330 362 394 426 458 490 274 ... 506 @@ -1177,6 +1751,21 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 512 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760 // ... // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 ... 1023 +======= + // t[8]: 256 288 320 352 384 416 448 480 264 296 328 360 392 424 456 488 272 + // ... 504 t[9]: 257 289 321 353 385 417 449 481 265 297 329 361 393 425 457 + // 489 273 ... 505 t[10]: 258 290 322 354 386 418 450 482 266 298 330 362 394 + // 426 458 490 274 ... 506 t[11]: 259 291 323 355 387 419 451 483 267 299 331 + // 363 395 427 459 491 275 ... 507 t[12]: 260 292 324 356 388 420 452 484 268 + // 300 332 364 396 428 460 492 276 ... 508 t[13]: 261 293 325 357 389 421 453 + // 485 269 301 333 365 397 429 461 493 277 ... 509 t[14]: 262 294 326 358 390 + // 422 454 486 270 302 334 366 398 430 462 494 278 ... 510 t[15]: 263 295 327 + // 359 391 423 455 487 271 303 335 367 399 431 463 495 279 ... 
511 t[16]: 512 + // 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760 + // ... + // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 + // ... 1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef __msvc_cl__ #pragma unroll(4) #endif @@ -1195,6 +1784,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 17 ... 497 // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 18 ... 498 // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 19 ... 499 +<<<<<<< HEAD // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 20 ... 500 // t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 21 ... 501 // t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 22 ... 502 @@ -1210,6 +1800,25 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 512 544 576 608 640 672 704 736 768 800 832 864 896 928 960 992 528 ... 1008 // ... // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543 ... 1023 +======= + // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 20 ... + // 500 t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 21 + // ... 501 t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 + // 22 ... 502 t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 + // 487 23 ... 503 t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 + // 456 488 24 ... 504 t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 + // 425 457 489 25 ... 505 t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 + // 394 426 458 490 26 ... 506 t[11]: 11 43 75 107 139 171 203 235 267 299 331 + // 363 395 427 459 491 27 ... 507 t[12]: 12 44 76 108 140 172 204 236 268 300 + // 332 364 396 428 460 492 28 ... 508 t[13]: 13 45 77 109 141 173 205 237 269 + // 301 333 365 397 429 461 493 29 ... 509 t[14]: 14 46 78 110 142 174 206 238 + // 270 302 334 366 398 430 462 494 30 ... 510 t[15]: 15 47 79 111 143 175 207 + // 239 271 303 335 367 399 431 463 495 31 ... 511 t[16]: 512 544 576 608 640 + // 672 704 736 768 800 832 864 896 928 960 992 528 ... 1008 + // ... + // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543 + // ... 1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i const1 = _mm512_set_epi64( 0x000000000000000d, 0x000000000000000c, @@ -1232,6 +1841,7 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { #pragma unroll(8) #endif for (int i = 0; i < 8; ++i) { +<<<<<<< HEAD r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/const1, d[i + 8]); r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/const2, d[i + 8]); r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/const1, d[i + 24]); @@ -1257,6 +1867,37 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { // t[16]: 16 48 80 112 144 176 208 240 272 304 336 368 400 432 464 496 528 560 ... 1008 // ... // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575 ... 
1023 +======= + r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/ const1, d[i + 8]); + r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/ const2, d[i + 8]); + r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/ const1, d[i + 24]); + r[i + 24] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/ const2, d[i + 24]); + } + + // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 512 544 + // ... 992 t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 + // 513 545 ... 993 t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 + // 450 482 514 546 ... 994 t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 + // 387 419 451 483 515 547 ... 995 t[4]: 4 36 68 100 132 164 196 228 260 292 + // 324 356 388 420 452 484 516 548 ... 996 t[5]: 5 37 69 101 133 165 197 229 + // 261 293 325 357 389 421 453 485 517 549 ... 997 t[6]: 6 38 70 102 134 166 + // 198 230 262 294 326 358 390 422 454 486 518 550 ... 998 t[7]: 7 39 71 103 + // 135 167 199 231 263 295 327 359 391 423 455 487 519 551 ... 999 t[8]: 8 40 + // 72 104 136 168 200 232 264 296 328 360 392 424 456 488 520 552 ... 1000 + // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 521 553 + // ... 1001 t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 + // 490 522 554 ... 1002 t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 + // 395 427 459 491 523 555 ... 1003 t[12]: 12 44 76 108 140 172 204 236 268 + // 300 332 364 396 428 460 492 524 556 ... 1004 t[13]: 13 45 77 109 141 173 + // 205 237 269 301 333 365 397 429 461 493 525 557 ... 1005 t[14]: 14 46 78 + // 110 142 174 206 238 270 302 334 366 398 430 462 494 526 558 ... 1006 t[15]: + // 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 527 559 ... + // 1007 t[16]: 16 48 80 112 144 176 208 240 272 304 336 368 400 432 464 496 + // 528 560 ... 1008 + // ... + // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575 + // ... 
1023 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i const3 = _mm512_set_epi64( 0x000000000000000b, 0x000000000000000a, @@ -1279,17 +1920,36 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { #pragma unroll(16) #endif for (int i = 0; i < 16; ++i) { +<<<<<<< HEAD d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/const3, r[i + 16]); d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/const4, r[i + 16]); +======= + d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/ const3, r[i + 16]); + d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/ const4, r[i + 16]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // Code referred to FBGEMM: // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#LL19C6-L19C6 +<<<<<<< HEAD template<> inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat16* dst, int64_t ld_dst, int M, int N) { // load from src TORCH_CHECK(M <= 32 && N <= 32, "transpose_mxn expects M, N <= 32."); +======= +template <> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst, + int M, + int N) { + // load from src + TORCH_CHECK( + M <= 32 && N <= 32, "transpose_mxn expects M, N <= 32."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i r[32]; int i; if (N == 32) { @@ -1322,6 +1982,7 @@ inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat1 } } +<<<<<<< HEAD template && ((M <= 32 && M != 16) || (N <= 32 && N != 16)), int> = 0> inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat16* dst, int64_t ld_dst) { @@ -1330,6 +1991,32 @@ inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat16* dst, in template<> inline void transpose_mxn(const Half* src, int64_t ld_src, Half* dst, int64_t ld_dst, int M, int N) { +======= +template < + typename T, + int M, + int N, + typename std::enable_if_t< + std::is_same_v && + ((M <= 32 && M != 16) || (N <= 32 && N != 16)), + int> = 0> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, ld_dst, M, N); +} + +template <> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst, + int M, + int N) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(M <= 32 && N <= 32, "transpose_mxn expects M, N <= 32."); // load from src __m512i r[32]; @@ -1364,15 +2051,39 @@ inline void transpose_mxn(const Half* src, int64_t ld_src, Half* dst, int6 } } +<<<<<<< HEAD template && ((M <= 32 && M != 16) || (N <= 32 && N != 16)), int> = 0> inline void transpose_mxn(const Half* src, int64_t ld_src, Half* dst, int64_t ld_dst) { +======= +template < + typename T, + int M, + int N, + typename std::enable_if_t< + std::is_same_v && + ((M <= 32 && M != 16) || (N <= 32 && N != 16)), + int> = 0> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) transpose_mxn(src, ld_src, dst, ld_dst, M, N); } template <> +<<<<<<< HEAD class Vectorized: public Vectorized16 { public: +======= +struct 
is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized16 { + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using Vectorized16::Vectorized16; using value_type = Half; @@ -1387,6 +2098,7 @@ class Vectorized: public Vectorized16 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_add_ps(x, y); }); } @@ -1431,6 +2143,80 @@ inline Vectorized Vectorized::lt(const Vectorized& other) cons } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_add_ps(x, y); + }); +} +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_sub_ps(x, y); + }); +} +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_mul_ps(x, y); + }); +} +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { + return _mm512_div_ps(x, y); + }); +} + +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return _mm512_and_si512(a, b); +} +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return _mm512_or_si512(a, b); +} +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_si512(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -1442,7 +2228,13 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. 
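A minimal scalar sketch of the NaN-propagating semantics described in the comment above, which the vectorized maximum/minimum below reproduce by OR-ing an unordered-compare mask into the blended result; the helper name maximum_ref is illustrative and not part of the patch.

#include <cmath>
#include <limits>

// Scalar reference for the IEEE 754-201x `maximum` behaviour the vector code
// emulates: if either operand is NaN the result is NaN, otherwise the larger
// value. In the AVX-512 code the _CMP_UNORD_Q mask plays the isnan() role.
inline float maximum_ref(float a, float b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<float>::quiet_NaN();
  return a > b ? a : b;
}
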
template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; cvtfp16_fp32(__m512i(a), a_lo, a_hi); @@ -1462,7 +2254,13 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; __m512i zero_vec = _mm512_set1_epi32(0); @@ -1472,10 +2270,17 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. auto o1 = _mm512_or_ps(min_lo, nan_lo); auto o2 = _mm512_or_ps(min_hi, nan_hi); @@ -1483,8 +2288,15 @@ Vectorized inline minimum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 min_lo, min_hi; __m512 max_lo, max_hi; @@ -1497,7 +2309,13 @@ Vectorized inline clamp(const Vectorized& a, } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 max_lo, max_hi; cvtfp16_fp32(__m512i(a), a_lo, a_hi); @@ -1508,7 +2326,13 @@ Vectorized inline clamp_max(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 min_lo, min_hi; cvtfp16_fp32(__m512i(a), a_lo, a_hi); @@ -1524,8 +2348,15 @@ inline void convert(const Half* src, Half* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto vsrc = + _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc); } #ifndef __msvc_cl__ @@ -1539,7 +2370,12 @@ inline void convert(const Half* src, Half* dst, int64_t n) { template <> inline void convert(const float* src, Half* dst, int64_t n) { int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + 
for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a = _mm512_loadu_ps(&src[i]); __m512 b = _mm512_loadu_ps(&src[i + 16]); @@ -1553,7 +2389,11 @@ inline void convert(const float* src, Half* dst, int64_t n) { template <> inline void convert(const double* src, Half* dst, int64_t n) { +<<<<<<< HEAD auto load_float = [](const double *src) -> __m512 { +======= + auto load_float = [](const double* src) -> __m512 { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Load one float vector from an array of doubles __m256 a = _mm512_cvtpd_ps(_mm512_loadu_pd(src)); __m256 b = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8)); @@ -1561,7 +2401,12 @@ inline void convert(const double* src, Half* dst, int64_t n) { }; int64_t i; +<<<<<<< HEAD for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) { +======= + for (i = 0; i + Vectorized::size() <= n; + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a = load_float(&src[i]); __m512 b = load_float(&src[i + 16]); @@ -1574,8 +2419,15 @@ inline void convert(const double* src, Half* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 a_lo, a_hi; __m512 b_lo, b_hi; __m512 c_lo, c_hi; @@ -1587,6 +2439,7 @@ Vectorized inline fmadd(const Vectorized& a, return cvtfp32_fp16(o1, o2); } +<<<<<<< HEAD #define CONVERT_VECTORIZED_INIT(type, name) \ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ __m512 o1, o2; \ @@ -1627,12 +2480,59 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V } \ return Vectorized::loadu(arr2); \ } +======= +#define CONVERT_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + __m512 o1, o2; \ + cvt_to_fp32(__m512i(a), o1, o2); \ + return std::make_tuple(o1, o2); \ + } \ + \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + return cvt_from_fp32(__m512(a), __m512(b)); \ + } +CONVERT_VECTORIZED_INIT(BFloat16, bfloat16) +CONVERT_VECTORIZED_INIT(Half, half) + +#else // defined(CPU_CAPABILITY_AVX512) + +#define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> \ + convert_##name##_float(const Vectorized& a) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr2); \ + for (const auto k : c10::irange(K)) { \ + arr[k] = c10::convert(arr2[k]); \ + } \ + return std::make_tuple( \ + Vectorized::loadu(arr), \ + Vectorized::loadu(arr + Vectorized::size())); \ + } \ + \ + inline Vectorized convert_float_##name( \ + const Vectorized& a, const Vectorized& b) { \ + constexpr int64_t K = Vectorized::size(); \ + __at_align__ float arr[K]; \ + __at_align__ type arr2[K]; \ + a.store(arr); \ + b.store(arr + Vectorized::size()); \ + for (const auto k : c10::irange(K)) { \ + arr2[k] = c10::convert(arr[k]); \ + } \ + return Vectorized::loadu(arr2); \ + } +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16) CONVERT_NON_VECTORIZED_INIT(Half, half) #endif // defined(CPU_CAPABILITY_AVX512) #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD #define LOAD_FP32_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ @@ -1648,10 +2548,30 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec out1 = out1_values; \ out2 = out2_values; \ } +======= +#define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ + __m512 out_values; \ + cvt_to_fp32(values, out_values); \ + out = out_values; \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + auto vec = Vectorized::loadu(data); \ + __m512 out1_values, out2_values; \ + cvt_to_fp32(vec, out1_values, out2_values); \ + out1 = out1_values; \ + out2 = out2_values; \ + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16) LOAD_FP32_VECTORIZED_INIT(Half, fp16) #else // defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ __at_align__ float values[Vectorized::size()]; \ @@ -1666,8 +2586,31 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec data += Vectorized::size(); \ load_fp32_from_##name(data, out2); \ } +======= +#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; \ + for (const auto k : c10::irange(Vectorized::size())) { \ + values[k] = data[k]; \ + } \ + out = Vectorized::loadu(values); \ + } \ + \ + inline void load_fp32_from_##name( \ + const type* data, Vectorized& out1, Vectorized& out2) { \ + load_fp32_from_##name(data, out1); \ + data += Vectorized::size(); \ + load_fp32_from_##name(data, out2); \ + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16) LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16) #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h index 444b41cfb7e5..68e93034cb5f 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -3,27 +3,51 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! 
// See Note [Do not compile initializers with AVX] +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX512) #define SLEEF_STATIC_LIBS #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template <> class Vectorized> { private: __m512d values; static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m512d values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = c10::complex; using size_type = int; static constexpr size_type size() { @@ -34,6 +58,7 @@ template <> class Vectorized> { Vectorized(c10::complex val) { double real_value = val.real(); double imag_value = val.imag(); +<<<<<<< HEAD values = _mm512_setr_pd(real_value, imag_value, real_value, imag_value, real_value, imag_value, real_value, imag_value); } @@ -43,19 +68,53 @@ template <> class Vectorized> { val2.real(), val2.imag(), val3.real(), val3.imag(), val4.real(), val4.imag()); +======= + values = _mm512_setr_pd( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4) { + values = _mm512_setr_pd( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m512d() const { return values; } template +<<<<<<< HEAD static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) { // convert c10::complex index mask to V index mask: xy -> xxyy +======= + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { + // convert c10::complex index mask to V index mask: xy -> xxyy +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NOLINTNEXTLINE(clang-diagnostic-warning) switch (mask) { case 0: return a; case 1: +<<<<<<< HEAD return _mm512_mask_blend_pd(0x03, a.values, b.values); //b0000 0001 = b0000 0011 case 2: return _mm512_mask_blend_pd(0x0C, a.values, b.values); //b0000 0010 = b0000 1100 @@ -108,6 +167,80 @@ template <> class Vectorized> { static Vectorized> set(const Vectorized>& a, const Vectorized>& b, int64_t count = size()) { +======= + return _mm512_mask_blend_pd( + 0x03, a.values, b.values); // b0000 0001 = b0000 0011 + case 2: + return _mm512_mask_blend_pd( + 0x0C, a.values, b.values); // b0000 0010 = b0000 1100 + case 3: + return _mm512_mask_blend_pd( + 0x0F, a.values, b.values); // b0000 0011 = b0000 1111 + case 4: + return _mm512_mask_blend_pd( + 0x30, a.values, b.values); // b0000 0100 = b0011 0000 + case 5: + return _mm512_mask_blend_pd( + 0x33, a.values, b.values); // b0000 0101 = b0011 0011 + case 6: + 
return _mm512_mask_blend_pd( + 0x3C, a.values, b.values); // b0000 0110 = b0011 1100 + case 7: + return _mm512_mask_blend_pd( + 0x3F, a.values, b.values); // b0000 0111 = b0011 1111 + case 8: + return _mm512_mask_blend_pd( + 0xC0, a.values, b.values); // b0000 1000 = b1100 0000 + case 9: + return _mm512_mask_blend_pd( + 0xC3, a.values, b.values); // b0000 1001 = b1100 0011 + case 10: + return _mm512_mask_blend_pd( + 0xCC, a.values, b.values); // b0000 1010 = b1100 1100 + case 11: + return _mm512_mask_blend_pd( + 0xCF, a.values, b.values); // b0000 1011 = b1100 1111 + case 12: + return _mm512_mask_blend_pd( + 0xF0, a.values, b.values); // b0000 1100 = b1111 0000 + case 13: + return _mm512_mask_blend_pd( + 0xF3, a.values, b.values); // b0000 1101 = b1111 0011 + case 14: + return _mm512_mask_blend_pd( + 0xFC, a.values, b.values); // b0000 1110 = b1111 1100 + case 15: + return _mm512_mask_blend_pd( + 0xFF, a.values, b.values); // b0000 1111 = b1111 1111 + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm512_unpacklo_pd(mask.values, mask.values); + auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); + auto mmask = _mm512_cmp_epi64_mask( + _mm512_castpd_si512(mask_), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_pd(mmask, a.values, b.values); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + c10::complex(1) * step, + base + c10::complex(2) * step, + base + c10::complex(3) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -120,6 +253,7 @@ template <> class Vectorized> { } return b; } +<<<<<<< HEAD static Vectorized> loadu(const void* ptr, int64_t count = size()) { if (count == size()) return _mm512_loadu_pd(reinterpret_cast(ptr)); @@ -129,6 +263,20 @@ template <> class Vectorized> { // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. for (const auto i : c10::irange(2*size())) { +======= + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm512_loadu_pd(reinterpret_cast(ptr)); + + __at_align__ double tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
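A standalone sketch of the partial-load idiom the loadu() comment above describes, assuming only the standard library; kSize stands in for Vectorized<c10::complex<double>>::size() and load_partial is an illustrative name, not part of the patch.

#include <cstring>

// Zero a stack buffer with a loop (not "={0}"), copy only `count` valid
// complex values in, then the whole buffer can be read as one full-width
// vector with the unread tail lanes holding a defined value.
constexpr int kSize = 4;

void load_partial(const double* src, double (&tmp)[2 * kSize], int count) {
  for (int i = 0; i < 2 * kSize; ++i) {
    tmp[i] = 0.0;  // defined value for the lanes memcpy does not overwrite
  }
  std::memcpy(tmp, src, count * 2 * sizeof(double));  // count complex elements
}
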
+ for (const auto i : c10::irange(2 * size())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_values[i] = 0.0; } std::memcpy( @@ -141,14 +289,25 @@ template <> class Vectorized> { if (count == size()) { _mm512_storeu_pd(reinterpret_cast(ptr), values); } else if (count > 0) { +<<<<<<< HEAD double tmp_values[2*size()]; +======= + double tmp_values[2 * size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_pd(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); } } +<<<<<<< HEAD const c10::complex& operator[](int idx) const = delete; c10::complex& operator[](int idx) = delete; Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { +======= + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { @@ -159,6 +318,7 @@ template <> class Vectorized> { // AVX512 doesn't have horizontal add & horizontal sub instructions. // TODO: hadd_pd() & hsub_pd() may have scope for improvement. static inline __m512d hadd_pd(__m512d a, __m512d b) { +<<<<<<< HEAD __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0); __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1); return _mm512_add_pd(_mm512_mask_permutex2var_pd(a, 0xff, idx1, b), @@ -199,6 +359,60 @@ template <> class Vectorized> { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); auto angle = _mm512_permute_pd(angle_(), 0x55); // angle 90-angle return _mm512_and_pd(angle, real_mask); // angle 0 +======= + __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0); + __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1); + return _mm512_add_pd( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); + } + static inline __m512d hsub_pd(__m512d a, __m512d b) { + __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0); + __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1); + return _mm512_sub_pd( + _mm512_mask_permutex2var_pd(a, 0xff, idx1, b), + _mm512_mask_permutex2var_pd(a, 0xff, idx2, b)); + } + __m512d abs_2_() const { + auto val_2 = _mm512_mul_pd(values, values); // a*a b*b + return hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b + } + __m512d abs_() const { + auto real = _mm512_movedup_pd(values); // real real + // movehdup_pd does not exist... 
+ auto imag = _mm512_permute_pd(values, 0xff); // imag imag + return Sleef_hypotd8_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + return _mm512_and_pd(abs_(), real_mask); // abs 0 + } + __m512d angle_() const { + // angle = atan2(b/a) + auto b_a = _mm512_permute_pd(values, 0x55); // b a + return Sleef_atan2d8_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); + auto angle = _mm512_permute_pd(angle_(), 0x55); // angle 90-angle + return _mm512_and_pd(angle, real_mask); // angle 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sgn() const { auto abs = abs_(); @@ -208,16 +422,29 @@ template <> class Vectorized> { return _mm512_mask_blend_pd(mask, div, zero); } __m512d real_() const { +<<<<<<< HEAD const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); +======= + const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_pd(values, real_mask); } Vectorized> real() const { return real_(); } __m512d imag_() const { +<<<<<<< HEAD const __m512d imag_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, @@ -230,12 +457,37 @@ template <> class Vectorized> { __m512d conj_() const { const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); return _mm512_xor_pd(values, sign_mask); // a -b +======= + const __m512d imag_mask = _mm512_castsi512_pd(_mm512_setr_epi64( + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF)); + return _mm512_and_pd(values, imag_mask); + } + Vectorized> imag() const { + return _mm512_permute_pd(imag_(), 0x55); // b a + } + __m512d conj_() const { + const __m512d sign_mask = + _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + return _mm512_xor_pd(values, sign_mask); // a -b +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> conj() const { return conj_(); } Vectorized> log() const { +<<<<<<< HEAD // Most trigonomic ops use the log() op to improve complex number performance. +======= + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::log); } Vectorized> log2() const { @@ -250,7 +502,12 @@ template <> class Vectorized> { return map(std::log1p); } Vectorized> asin() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // asin(x) // // = -i*ln(iz + sqrt(1 -z^2)) // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) @@ -258,6 +515,7 @@ template <> class Vectorized> { // const __m512d one = _mm512_set1_pd(1); // auto conj = conj_(); +<<<<<<< HEAD // auto b_a = _mm512_permute_pd(conj, 0x55); //-b a // auto ab = _mm512_mul_pd(conj, b_a); //-ab -ab // auto im = _mm512_add_pd(ab, ab); //-2ab -2ab @@ -269,12 +527,31 @@ template <> class Vectorized> { // auto root = Vectorized(_mm512_mask_blend_pd(0xAA, re, im)).sqrt(); //sqrt(re + i*im) // auto ln = Vectorized(_mm512_add_pd(b_a, root)).log(); //ln(iz + sqrt()) // return Vectorized(_mm512_permute_pd(ln.values, 0x55)).conj(); //-i*ln() +======= + // auto b_a = _mm512_permute_pd(conj, 0x55); //-b a + // auto ab = _mm512_mul_pd(conj, b_a); //-ab + // -ab auto im = _mm512_add_pd(ab, ab); //-2ab -2ab + + // auto val_2 = _mm512_mul_pd(values, values); // a*a + // b*b auto re = hsub_pd(val_2, _mm512_permute_pd(val_2, 0x55)); // a*a-b*b + // b*b-a*a re = _mm512_sub_pd(one, re); + + // auto root = Vectorized(_mm512_mask_blend_pd(0xAA, re, im)).sqrt(); + // //sqrt(re + i*im) auto ln = Vectorized(_mm512_add_pd(b_a, root)).log(); + // //ln(iz + sqrt()) return Vectorized(_mm512_permute_pd(ln.values, + // 0x55)).conj(); //-i*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::asin); } Vectorized> acos() const { // acos(x) = pi/2 - asin(x) constexpr auto pi_2d = c10::pi / 2; +<<<<<<< HEAD const __m512d pi_2 = _mm512_setr_pd(pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0); +======= + const __m512d pi_2 = + _mm512_setr_pd(pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_pd(pi_2, asin()); } Vectorized> atan() const; @@ -282,6 +559,7 @@ template <> class Vectorized> { return map(std::atanh); } Vectorized> exp() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //exp(a + bi) // // = exp(a)*(cos(b) + sin(b)i) @@ -291,13 +569,33 @@ template <> class Vectorized> { // auto sin_cos = Sleef_sincosd8_u10(values); //[sin(a), cos(a)] [sin(b), cos(b)] // auto cos_sin = _mm512_mask_blend_pd(0xAA, _mm512_permute_pd(sin_cos.y, 0x55), // sin_cos.x); //cos(b) sin(b) +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
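The commented-out vector path that follows is built on the identity exp(a + bi) = exp(a)·(cos b + i·sin b); a scalar reference under that identity, with an illustrative helper name that is not part of the patch.

#include <cmath>
#include <complex>

// Scalar reference for the identity behind the (disabled) vectorized exp():
// exp(a + bi) = exp(a) * (cos(b) + i*sin(b)).
inline std::complex<double> exp_complex_ref(std::complex<double> z) {
  const double scale = std::exp(z.real());
  return {scale * std::cos(z.imag()), scale * std::sin(z.imag())};
}
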
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expd8_u10(values); //exp(a) exp(b) exp = + // _mm512_mask_blend_pd(0xAA, exp, _mm512_permute_pd(exp, 0x55)); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosd8_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm512_mask_blend_pd(0xAA, + // _mm512_permute_pd(sin_cos.y, 0x55), + // sin_cos.x); //cos(b) + // sin(b) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm512_mul_pd(exp, cos_sin); return map(std::exp); } Vectorized> exp2() const { // Use identity 2**x = exp(log(2) * x) const __m512d ln_2 = _mm512_set1_pd(c10::ln_2); +<<<<<<< HEAD Vectorized> scaled_values = _mm512_mul_pd(values, ln_2); +======= + Vectorized> scaled_values = + _mm512_mul_pd(values, ln_2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return scaled_values.exp(); } Vectorized> expm1() const { @@ -326,7 +624,12 @@ template <> class Vectorized> { return _mm512_sub_pd(zero, values); } Vectorized> round() const { +<<<<<<< HEAD return _mm512_roundscale_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> tan() const { return map(std::tan); @@ -335,7 +638,12 @@ template <> class Vectorized> { return map(std::tanh); } Vectorized> trunc() const { +<<<<<<< HEAD return _mm512_roundscale_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sqrt() const { return map(std::sqrt); @@ -344,7 +652,12 @@ template <> class Vectorized> { Vectorized> rsqrt() const { return sqrt().reciprocal(); } +<<<<<<< HEAD Vectorized> pow(const Vectorized> &exp) const { +======= + Vectorized> pow( + const Vectorized>& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex x_tmp[size()]; __at_align__ c10::complex y_tmp[size()]; store(x_tmp); @@ -357,6 +670,7 @@ template <> class Vectorized> { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +<<<<<<< HEAD Vectorized> operator==(const Vectorized>& other) const { auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ); return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, mask, @@ -411,10 +725,86 @@ template <> Vectorized> inline operator*(const Vectorized Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
+======= + Vectorized> operator==( + const Vectorized>& other) const { + auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF)); + } + Vectorized> operator!=( + const Vectorized>& other) const { + auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF)); + } + Vectorized> operator<( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_add_pd(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_sub_pd(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m512d sign_mask = + _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); + auto ac_bd = _mm512_mul_pd(a, b); // ac bd + + auto d_c = _mm512_permute_pd(b, 0x55); // d c + d_c = _mm512_xor_pd(sign_mask, d_c); // d -c + auto ad_bc = _mm512_mul_pd(a, d_c); // ad -bc + + auto ret = Vectorized>::hsub_pd( + ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // auto mask = _mm512_set1_pd(-0.f); // auto fabs_cd = _mm512_andnot_pd(mask, b); // |c| |d| // auto fabs_dc = _mm512_permute_pd(fabs_cd, 0x55); // |d| |c| +<<<<<<< HEAD // auto scale = _mm512_rcp14_pd(_mm512_max_pd(fabs_cd, fabs_dc)); // 1/sc 1/sc // auto a2 = _mm512_mul_pd(a, scale); // a/sc b/sc // auto b2 = _mm512_mul_pd(b, scale); // c/sc d/sc @@ -433,6 +823,30 @@ template <> Vectorized> inline operator/(const Vectorized tmp1[Vectorized>::size()]; __at_align__ c10::complex tmp2[Vectorized>::size()]; __at_align__ c10::complex out[Vectorized>::size()]; +======= + // auto scale = _mm512_rcp14_pd(_mm512_max_pd(fabs_cd, fabs_dc)); // 1/sc + // 1/sc auto a2 = _mm512_mul_pd(a, scale); // a/sc b/sc auto b2 = + // _mm512_mul_pd(b, scale); // c/sc d/sc auto acbd2 = + // _mm512_mul_pd(a2, b2); + + // const __m512d sign_mask = _mm512_setr_pd(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); auto dc2 = _mm512_permute_pd(b2, 0x55); // d/sc c/sc + // dc2 = _mm512_xor_pd(sign_mask, dc2); // -d/|c,d| c/sc + // auto adbc2 = _mm512_mul_pd(a2, dc2); //-ad/sc^2 bc/sc^2 + // auto res2 = Vectorized>::hadd_pd(acbd2, adbc2); + // //(ac+bd)/sc^2 (bc-ad)/sc^2 + + // // get the denominator + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm512_div_pd(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; + __at_align__ c10::complex + out[Vectorized>::size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a.store(tmp1); b.store(tmp2); for (const auto i : c10::irange(Vectorized>::size())) { @@ -442,6 +856,7 @@ template <> Vectorized> inline operator/(const Vectorized> Vectorized>::reciprocal() const{ // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //re + im*i = (a + bi) / (c + di) @@ -449,6 +864,17 @@ inline Vectorized> Vectorized>::recipr // //im = (bc - ad)/abs_2() = d/abs_2() // const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); // auto c_d = _mm512_xor_pd(sign_mask, values); //c -d +======= +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); auto c_d = _mm512_xor_pd(sign_mask, values); //c -d +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm512_div_pd(c_d, abs_2_()); __at_align__ c10::complex tmp[size()]; store(tmp); @@ -458,6 +884,7 @@ inline Vectorized> Vectorized>::recipr return loadu(tmp); } +<<<<<<< HEAD inline Vectorized> Vectorized>::atan() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
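The commented-out derivation that follows uses atan(z) = (i/2)·ln((i + z)/(i − z)); a scalar reference under that identity, with an illustrative helper name that is not part of the patch.

#include <complex>

// Scalar reference for the identity behind the (disabled) vectorized atan():
// atan(z) = (i/2) * log((i + z) / (i - z)).
inline std::complex<double> atan_complex_ref(std::complex<double> z) {
  const std::complex<double> i(0.0, 1.0);
  return (i / 2.0) * std::log((i + z) / (i - z));
}
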
// // atan(x) = i/2 * ln((i + z)/(i - z)) @@ -468,12 +895,33 @@ inline Vectorized> Vectorized>::atan() // auto sub = Vectorized(_mm512_sub_pd(i, values)); // -a 1-b // auto ln = (sum/sub).log(); // ln((i + z)/(i - z)) // return i_half*ln; // i/2*ln() +======= +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m512d i = _mm512_setr_pd(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm512_setr_pd(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm512_add_pd(i, values)); // a + // 1+b auto sub = Vectorized(_mm512_sub_pd(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::atan); } template <> +<<<<<<< HEAD Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto zero_vec = _mm512_set1_epi64(0); auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); @@ -481,14 +929,24 @@ Vectorized> inline maximum(const Vectorized>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_pd(max, _mm512_castsi512_pd(isnan)); } template <> +<<<<<<< HEAD Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto zero_vec = _mm512_set1_epi64(0); auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); @@ -496,24 +954,41 @@ Vectorized> inline minimum(const Vectorized>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_pd(min, _mm512_castsi512_pd(isnan)); } template <> +<<<<<<< HEAD Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_pd(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_pd(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) { return _mm512_xor_pd(a, b); @@ -529,8 +1004,37 @@ inline Vectorized> Vectorized>::ne(con auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal return (ne.real() | ne.imag()) & Vectorized>(_mm512_set1_pd(1.0)); +======= +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_xor_pd(a, 
b); +} + +inline Vectorized> Vectorized>::eq( + const Vectorized>& other) const { + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm512_set1_pd(1.0)); +} + +inline Vectorized> Vectorized>::ne( + const Vectorized>& other) const { + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm512_set1_pd(1.0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h index 4b07fb3af863..7b2a00cfce8a 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -3,27 +3,51 @@ // DO NOT DEFINE STATIC DATA IN THIS HEADER! // See Note [Do not compile initializers with AVX] +<<<<<<< HEAD #include #include #include #include +======= +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CPU_CAPABILITY_AVX512) #define SLEEF_STATIC_LIBS #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template <> class Vectorized> { private: __m512 values; static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +template <> +struct is_vec_specialized_for> : std::bool_constant { +}; + +template <> +class Vectorized> { + private: + __m512 values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = c10::complex; using size_type = int; static constexpr size_type size() { @@ -34,6 +58,7 @@ template <> class Vectorized> { Vectorized(c10::complex val) { float real_value = val.real(); float imag_value = val.imag(); +<<<<<<< HEAD values = _mm512_setr_ps(real_value, imag_value, real_value, imag_value, real_value, imag_value, @@ -55,13 +80,65 @@ template <> class Vectorized> { val6.real(), val6.imag(), val7.real(), val7.imag(), val8.real(), val8.imag()); +======= + values = _mm512_setr_ps( + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value, + real_value, + imag_value); + } + Vectorized( + c10::complex val1, + c10::complex val2, + c10::complex val3, + c10::complex val4, + c10::complex val5, + c10::complex val6, + c10::complex val7, + c10::complex val8) { + values = _mm512_setr_ps( + val1.real(), + val1.imag(), + val2.real(), + val2.imag(), + val3.real(), + val3.imag(), + val4.real(), + val4.imag(), + val5.real(), + val5.imag(), + 
val6.real(), + val6.imag(), + val7.real(), + val7.imag(), + val8.real(), + val8.imag()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } operator __m512() const { return values; } template +<<<<<<< HEAD static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) { +======= + static Vectorized> blend( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // convert c10::complex index mask to V index mask: xy -> xxyy static_assert(mask > -1 && mask < 256, "Unexpected mask value"); // The compiler would hopefully convert this switch condition @@ -577,6 +654,7 @@ template <> class Vectorized> { return _mm512_mask_blend_ps(0xFFF3, a.values, b.values); case 254: return _mm512_mask_blend_ps(0xFFFC, a.values, b.values); +<<<<<<< HEAD default: break; } return b; @@ -605,6 +683,42 @@ template <> class Vectorized> { static Vectorized> set(const Vectorized>& a, const Vectorized>& b, int64_t count = size()) { +======= + default: + break; + } + return b; + } + static Vectorized> blendv( + const Vectorized>& a, + const Vectorized>& b, + const Vectorized>& mask) { + // convert c10::complex index mask to V index mask: xy -> xxyy + auto mask_ = _mm512_unpacklo_ps(mask.values, mask.values); + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask( + _mm512_castps_si512(mask_), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_ps(mmask, a.values, b.values); + } + template + static Vectorized> arange( + c10::complex base = 0., + step_t step = static_cast(1)) { + return Vectorized>( + base, + base + step, + base + c10::complex(2) * step, + base + c10::complex(3) * step, + base + c10::complex(4) * step, + base + c10::complex(5) * step, + base + c10::complex(6) * step, + base + c10::complex(7) * step); + } + static Vectorized> set( + const Vectorized>& a, + const Vectorized>& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -625,6 +739,7 @@ template <> class Vectorized> { } return b; } +<<<<<<< HEAD static Vectorized> loadu(const void* ptr, int64_t count = size()) { if (count == size()) return _mm512_loadu_ps(reinterpret_cast(ptr)); @@ -634,6 +749,20 @@ template <> class Vectorized> { // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. for (const auto i : c10::irange(2*size())) { +======= + static Vectorized> loadu( + const void* ptr, + int64_t count = size()) { + if (count == size()) + return _mm512_loadu_ps(reinterpret_cast(ptr)); + + __at_align__ float tmp_values[2 * size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
+ for (const auto i : c10::irange(2 * size())) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_values[i] = 0.0; } std::memcpy( @@ -646,7 +775,11 @@ template <> class Vectorized> { if (count == size()) { _mm512_storeu_ps(reinterpret_cast(ptr), values); } else if (count > 0) { +<<<<<<< HEAD float tmp_values[2*size()]; +======= + float tmp_values[2 * size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_ps(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(c10::complex)); } @@ -654,6 +787,7 @@ template <> class Vectorized> { // AVX512 doesn't have horizontal add & horizontal sub instructions. // TODO: hadd_pd() & hsub_pd() may have scope for improvement. static inline __m512 hadd_ps(__m512 a, __m512 b) { +<<<<<<< HEAD __m512i idx1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); __m512i idx2 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1); return _mm512_add_ps(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), @@ -668,6 +802,29 @@ template <> class Vectorized> { const c10::complex& operator[](int idx) const = delete; c10::complex& operator[](int idx) = delete; Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { +======= + __m512i idx1 = _mm512_set_epi32( + 30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1); + return _mm512_add_ps( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); + } + static inline __m512 hsub_ps(__m512 a, __m512 b) { + __m512i idx1 = _mm512_set_epi32( + 30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0); + __m512i idx2 = _mm512_set_epi32( + 31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1); + return _mm512_sub_ps( + _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b), + _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b)); + } + const c10::complex& operator[](int idx) const = delete; + c10::complex& operator[](int idx) = delete; + Vectorized> map( + c10::complex (*const f)(const c10::complex&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex tmp[size()]; store(tmp); for (const auto i : c10::irange(size())) { @@ -676,6 +833,7 @@ template <> class Vectorized> { return loadu(tmp); } __m512 abs_2_() const { +<<<<<<< HEAD auto val_2 = _mm512_mul_ps(values, values); // a*a b*b auto ret = hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b return ret; @@ -704,6 +862,62 @@ template <> class Vectorized> { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); auto angle = _mm512_permute_ps(angle_(), 0xB1); // angle 90-angle return _mm512_and_ps(angle, real_mask); // angle 0 +======= + auto val_2 = _mm512_mul_ps(values, values); // a*a b*b + auto ret = hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b + return ret; + } + __m512 abs_() const { + auto real = _mm512_moveldup_ps(values); // real real + auto imag = _mm512_movehdup_ps(values); // imag imag + return Sleef_hypotf16_u05(real, imag); // abs abs + } + Vectorized> abs() const { + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 
0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + return _mm512_and_ps(abs_(), real_mask); // abs 0 + } + __m512 angle_() const { + // angle = atan2(b/a) + auto b_a = _mm512_permute_ps(values, 0xB1); // b a + return Sleef_atan2f16_u10(values, b_a); // 90-angle angle + } + Vectorized> angle() const { + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); + auto angle = _mm512_permute_ps(angle_(), 0xB1); // angle 90-angle + return _mm512_and_ps(angle, real_mask); // angle 0 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sgn() const { auto abs = abs_(); @@ -713,16 +927,37 @@ template <> class Vectorized> { return _mm512_mask_blend_ps(mask, div, zero); } __m512 real_() const { +<<<<<<< HEAD const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); +======= + const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_ps(values, real_mask); } Vectorized> real() const { return real_(); } __m512 imag_() const { +<<<<<<< HEAD const __m512 imag_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, @@ -736,12 +971,60 @@ template <> class Vectorized> { const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); return _mm512_xor_ps(values, sign_mask); // a -b +======= + const __m512 imag_mask = _mm512_castsi512_ps(_mm512_setr_epi32( + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF, + 0x00000000, + 0xFFFFFFFF)); + return _mm512_and_ps(values, imag_mask); + } + Vectorized> imag() const { + return _mm512_permute_ps(imag_(), 0xB1); // b a + } + __m512 conj_() const { + const __m512 sign_mask = _mm512_setr_ps( + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0); + return _mm512_xor_ps(values, sign_mask); // a -b +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> conj() const { return conj_(); } Vectorized> log() const { +<<<<<<< HEAD // Most trigonomic ops use the log() op to improve complex number performance. +======= + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::log); } Vectorized> log2() const { @@ -756,7 +1039,12 @@ template <> class Vectorized> { return map(std::log1p); } Vectorized> asin() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // // asin(x) // // = -i*ln(iz + sqrt(1 -z^2)) // // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) @@ -764,6 +1052,7 @@ template <> class Vectorized> { // const __m512 one = _mm512_set1_ps(1); // auto conj = conj_(); +<<<<<<< HEAD // auto b_a = _mm512_permute_ps(conj, 0xB1); //-b a // auto ab = _mm512_mul_ps(conj, b_a); //-ab -ab // auto im = _mm512_add_ps(ab, ab); //-2ab -2ab @@ -775,6 +1064,20 @@ template <> class Vectorized> { // auto root = Vectorized(_mm512_mask_blend_ps(0xAAAA, re, im)).sqrt(); //sqrt(re + i*im) // auto ln = Vectorized(_mm512_add_ps(b_a, root)).log(); //ln(iz + sqrt()) // return Vectorized(_mm512_permute_ps(ln.values, 0xB1)).conj(); //-i*ln() +======= + // auto b_a = _mm512_permute_ps(conj, 0xB1); //-b a + // auto ab = _mm512_mul_ps(conj, b_a); //-ab + // -ab auto im = _mm512_add_ps(ab, ab); //-2ab -2ab + + // auto val_2 = _mm512_mul_ps(values, values); // a*a + // b*b auto re = hsub_ps(val_2, _mm512_permute_ps(val_2, 0xB1)); // a*a-b*b + // b*b-a*a re = _mm512_sub_ps(one, re); + + // auto root = Vectorized(_mm512_mask_blend_ps(0xAAAA, re, im)).sqrt(); + // //sqrt(re + i*im) auto ln = Vectorized(_mm512_add_ps(b_a, root)).log(); + // //ln(iz + sqrt()) return Vectorized(_mm512_permute_ps(ln.values, + // 0xB1)).conj(); //-i*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::asin); } Vectorized> acos() const { @@ -785,6 +1088,7 @@ template <> class Vectorized> { return map(std::atanh); } Vectorized> exp() const { +<<<<<<< HEAD // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //exp(a + bi) // // = exp(a)*(cos(b) + sin(b)i) @@ -794,6 +1098,21 @@ template <> class Vectorized> { // auto sin_cos = Sleef_sincosf16_u10(values); //[sin(a), cos(a)] [sin(b), cos(b)] // auto cos_sin = _mm512_mask_blend_ps(0xAAAA, _mm512_permute_ps(sin_cos.y, 0xB1), // sin_cos.x); //cos(b) sin(b) +======= + // TODO: The vectorized implementation requires special handling for the + // case where real number/imag number is 0/Inf/NaN. 
+ // //exp(a + bi) + // // = exp(a)*(cos(b) + sin(b)i) + // auto exp = Sleef_expf16_u10(values); //exp(a) exp(b) exp = + // _mm512_mask_blend_ps(0xAAAA, exp, _mm512_permute_ps(exp, 0xB1)); //exp(a) + // exp(a) + + // auto sin_cos = Sleef_sincosf16_u10(values); //[sin(a), cos(a)] [sin(b), + // cos(b)] auto cos_sin = _mm512_mask_blend_ps(0xAAAA, + // _mm512_permute_ps(sin_cos.y, 0xB1), + // sin_cos.x); //cos(b) + // sin(b) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // return _mm512_mul_ps(exp, cos_sin); return map(std::exp); } @@ -829,7 +1148,12 @@ template <> class Vectorized> { return _mm512_sub_ps(zero, values); } Vectorized> round() const { +<<<<<<< HEAD return _mm512_roundscale_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> tan() const { return map(std::tan); @@ -838,7 +1162,12 @@ template <> class Vectorized> { return map(std::tanh); } Vectorized> trunc() const { +<<<<<<< HEAD return _mm512_roundscale_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized> sqrt() const { return map(std::sqrt); @@ -847,7 +1176,12 @@ template <> class Vectorized> { Vectorized> rsqrt() const { return sqrt().reciprocal(); } +<<<<<<< HEAD Vectorized> pow(const Vectorized> &exp) const { +======= + Vectorized> pow( + const Vectorized>& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex x_tmp[size()]; __at_align__ c10::complex y_tmp[size()]; store(x_tmp); @@ -860,6 +1194,7 @@ template <> class Vectorized> { // Comparison using the _CMP_**_OQ predicate. // `O`: get false if an operand is NaN // `Q`: do not raise if an operand is NaN +<<<<<<< HEAD Vectorized> operator==(const Vectorized>& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ); return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); @@ -913,10 +1248,101 @@ template <> Vectorized> inline operator*(const Vectorized Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
+======= + Vectorized> operator==( + const Vectorized>& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); + } + Vectorized> operator!=( + const Vectorized>& other) const { + auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF)); + } + Vectorized> operator<( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator<=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vectorized> operator>=( + const Vectorized>& other [[maybe_unused]]) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vectorized> eq( + const Vectorized>& other) const; + Vectorized> ne( + const Vectorized>& other) const; +}; + +template <> +Vectorized> inline operator+( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_add_ps(a, b); +} + +template <> +Vectorized> inline operator-( + const Vectorized>& a, + const Vectorized>& b) { + return _mm512_sub_ps(a, b); +} + +template <> +Vectorized> inline operator*( + const Vectorized>& a, + const Vectorized>& b) { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + const __m512 sign_mask = _mm512_setr_ps( + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0, + 0.0, + -0.0); + auto ac_bd = _mm512_mul_ps(a, b); // ac bd + + auto d_c = _mm512_permute_ps(b, 0xB1); // d c + d_c = _mm512_xor_ps(sign_mask, d_c); // d -c + auto ad_bc = _mm512_mul_ps(a, d_c); // ad -bc + + auto ret = Vectorized>::hsub_ps( + ac_bd, ad_bc); // ac - bd ad + bc + return ret; +} + +template <> +Vectorized> inline operator/( + const Vectorized>& a, + const Vectorized>& b) { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. 
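// Illustrative aside -- a scalar form, not code from this patch, of the product rule the
// vectorized operator* above implements: (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
// The intrinsics form the lane pairs (ac, bd) and (ad, -bc) and fold each pair with
// hsub_ps to obtain the real and imaginary components.
#include <complex>
#include <cstdio>

static std::complex<float> scalar_complex_mul(std::complex<float> x, std::complex<float> y) {
  float a = x.real(), b = x.imag(), c = y.real(), d = y.imag();
  return {a * c - b * d, a * d + b * c};
}

int main() {
  // (1 + 2i)(3 + 4i) = (3 - 8) + (4 + 6)i = -5 + 10i
  std::complex<float> p = scalar_complex_mul({1.0f, 2.0f}, {3.0f, 4.0f});
  std::printf("%g %+gi\n", p.real(), p.imag());
  return 0;
}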
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // //re + im*i = (a + bi) / (c + di) // auto mask = _mm512_set1_ps(-0.f); // auto fabs_cd = _mm512_andnot_ps(mask, b); // |c| |d| // auto fabs_dc = _mm512_permute_ps(fabs_cd, 0xB1); // |d| |c| +<<<<<<< HEAD // auto scale = _mm512_rcp14_ps(_mm512_max_ps(fabs_cd, fabs_dc)); // 1/sc 1/sc // auto a2 = _mm512_mul_ps(a, scale); // a/sc b/sc // auto b2 = _mm512_mul_ps(b, scale); // c/sc d/sc @@ -935,6 +1361,31 @@ template <> Vectorized> inline operator/(const Vectorized tmp1[Vectorized>::size()]; __at_align__ c10::complex tmp2[Vectorized>::size()]; +======= + // auto scale = _mm512_rcp14_ps(_mm512_max_ps(fabs_cd, fabs_dc)); // 1/sc + // 1/sc auto a2 = _mm512_mul_ps(a, scale); // a/sc b/sc auto b2 = + // _mm512_mul_ps(b, scale); // c/sc d/sc auto acbd2 = + // _mm512_mul_ps(a2, b2); + + // const __m512 sign_mask = _mm512_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0, + // -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, + // -0.0, 0.0); + // auto dc2 = _mm512_permute_ps(b2, 0xB1); // d/sc c/sc + // dc2 = _mm512_xor_ps(sign_mask, dc2); // -d/|c,d| c/sc + // auto adbc2 = _mm512_mul_ps(a2, dc2); //-ad/sc^2 bc/sc^2 + // auto res2 = Vectorized>::hadd_ps(acbd2, adbc2); + // //(ac+bd)/sc^2 (bc-ad)/sc^2 + + // // get the denominator + // auto denom2 = Vectorized>(b2).abs_2_(); // + // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 res2 = _mm512_div_ps(res2, denom2); return + // res2; + __at_align__ c10::complex + tmp1[Vectorized>::size()]; + __at_align__ c10::complex + tmp2[Vectorized>::size()]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ c10::complex out[Vectorized>::size()]; a.store(tmp1); b.store(tmp2); @@ -945,6 +1396,7 @@ template <> Vectorized> inline operator/(const Vectorized> Vectorized>::reciprocal() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. // //re + im*i = (a + bi) / (c + di) @@ -952,6 +1404,19 @@ inline Vectorized> Vectorized>::reciproc // //im = (bc - ad)/abs_2() = d/abs_2() // const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, // 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); +======= +inline Vectorized> Vectorized< + c10::complex>::reciprocal() const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // //re + im*i = (a + bi) / (c + di) + // //re = (ac + bd)/abs_2() = c/abs_2() + // //im = (bc - ad)/abs_2() = d/abs_2() + // const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0, + // 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, + // 0.0, -0.0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // auto c_d = _mm512_xor_ps(sign_mask, values); //c -d // return _mm512_div_ps(c_d, abs_2_()); __at_align__ c10::complex tmp[size()]; @@ -962,6 +1427,7 @@ inline Vectorized> Vectorized>::reciproc return loadu(tmp); } +<<<<<<< HEAD inline Vectorized> Vectorized>::atan() const { // TODO: The vectorized implementation requires special handling for the case where real number/imag number is 0/Inf/NaN. 
// // atan(x) = i/2 * ln((i + z)/(i - z)) @@ -974,12 +1440,36 @@ inline Vectorized> Vectorized>::atan() c // auto sub = Vectorized(_mm512_sub_ps(i, values)); // -a 1-b // auto ln = (sum/sub).log(); // ln((i + z)/(i - z)) // return i_half*ln; // i/2*ln() +======= +inline Vectorized> Vectorized>::atan() + const { + // TODO: The vectorized implementation requires special handling for the case + // where real number/imag number is 0/Inf/NaN. + // // atan(x) = i/2 * ln((i + z)/(i - z)) + // const __m512 i = _mm512_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, + // 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0); + // const Vectorized i_half = _mm512_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5, + // 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, + // 0.5); + + // auto sum = Vectorized(_mm512_add_ps(i, values)); // a + // 1+b auto sub = Vectorized(_mm512_sub_ps(i, values)); // -a 1-b auto + // ln = (sum/sub).log(); // ln((i + + // z)/(i - z)) return i_half*ln; // i/2*ln() +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(std::atan); } template <> +<<<<<<< HEAD Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline maximum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto zero_vector = _mm512_set1_epi32(0); auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); @@ -992,8 +1482,14 @@ Vectorized> inline maximum(const Vectorized +<<<<<<< HEAD Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline minimum( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto zero_vector = _mm512_set1_epi32(0); auto abs_a = a.abs_2_(); auto abs_b = b.abs_2_(); @@ -1006,37 +1502,76 @@ Vectorized> inline minimum(const Vectorized +<<<<<<< HEAD Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator&( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_ps(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator|( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_ps(a, b); } template <> +<<<<<<< HEAD Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) { +======= +Vectorized> inline operator^( + const Vectorized>& a, + const Vectorized>& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_xor_ps(a, b); } inline Vectorized> Vectorized>::eq( const Vectorized>& other) const { +<<<<<<< HEAD auto eq = (*this == other); // compares real and imag individually // If both real numbers and imag numbers are equal, then the complex numbers are equal return (eq.real() & eq.imag()) & Vectorized>(_mm512_set1_ps(1.0f)); +======= + auto eq = (*this == other); // compares real and imag individually + // If both real numbers and imag numbers are equal, then the complex numbers + // 
are equal + return (eq.real() & eq.imag()) & + Vectorized>(_mm512_set1_ps(1.0f)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline Vectorized> Vectorized>::ne( const Vectorized>& other) const { +<<<<<<< HEAD auto ne = (*this != other); // compares real and imag individually // If either real numbers or imag numbers are not equal, then the complex numbers are not equal return (ne.real() | ne.imag()) & Vectorized>(_mm512_set1_ps(1.0f)); +======= + auto ne = (*this != other); // compares real and imag individually + // If either real numbers or imag numbers are not equal, then the complex + // numbers are not equal + return (ne.real() | ne.imag()) & + Vectorized>(_mm512_set1_ps(1.0f)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_convert.h b/aten/src/ATen/cpu/vec/vec512/vec512_convert.h index af4801cccf48..a39b5b665e90 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_convert.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_convert.h @@ -159,7 +159,11 @@ template <> struct VecConvert { static inline VectorizedN apply( const VectorizedN& src) { +<<<<<<< HEAD return Vectorized(_mm512_cvttps_epi32(src[0])); +======= + return Vectorized(_mm512_cvttps_epi32(src[0])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -167,7 +171,11 @@ template <> struct VecConvert { static inline VectorizedN apply( const VectorizedN& src) { +<<<<<<< HEAD return Vectorized(_mm512_cvtepi32_ps(src[0])); +======= + return Vectorized(_mm512_cvtepi32_ps(src[0])); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -220,13 +228,24 @@ struct VecConvert< 1, float, 2, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { at::vec::Vectorized vec1 = convert_float_to_int8(src[0]); at::vec::Vectorized vec2 = convert_float_to_int8(src[1]); __m128 lane2 = _mm512_castps512_ps128(_mm512_castsi512_ps(vec2)); +<<<<<<< HEAD __m512 result = _mm512_insertf32x4(_mm512_castsi512_ps(vec1), lane2, 1); // Insert lane2 into the second 128-bit lane +======= + __m512 result = _mm512_insertf32x4( + _mm512_castsi512_ps(vec1), + lane2, + 1); // Insert lane2 into the second 128-bit lane +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::vec::Vectorized(_mm512_castps_si512(result)); } }; @@ -237,8 +256,12 @@ struct VecConvert< 1, float, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_float_to_int8(src[0]); } @@ -250,6 +273,7 @@ struct VecConvert< 2, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { 
static inline VectorizedN apply(const VectorizedN& src) { @@ -259,6 +283,17 @@ struct VecConvert< ) ); return VectorizedN(convert_int8_to_float(src[0]), convert_int8_to_float(src2)); +======= + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + __m512i src2 = + _mm512_castsi128_si512(_mm_castps_si128(_mm512_extractf32x4_ps( + _mm512_castsi512_ps(src[0]), 1) // Extract the second 128-bit lane + )); + return VectorizedN( + convert_int8_to_float(src[0]), + convert_int8_to_float(src2)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } }; @@ -268,8 +303,12 @@ struct VecConvert< 1, src_t, 1, +<<<<<<< HEAD typename std::enable_if_t, void>> { +======= + typename std::enable_if_t, void>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply(const VectorizedN& src) { return convert_int8_to_float(src[0]); } @@ -282,8 +321,12 @@ struct VecConvert< int64_t, 2, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const VectorizedN& src) { return VecConvert::apply( @@ -291,6 +334,53 @@ struct VecConvert< } }; +<<<<<<< HEAD +======= +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + at::vec::Vectorized src = src_n[0]; + __m128i res128 = cvtfp32_fp8e4m3(src); + return at::vec::Vectorized(_mm512_castsi128_si512(res128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + // cvt first 16x8 bits from Float8_e4m3fn to float + at::vec::Vectorized src = src_n[0]; + __m512 result; + cvtfp8e4m3_fp32(_mm512_castsi512_si128(src), result); + return at::vec::Vectorized(result); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + at::vec::Vectorized src = src_n[0]; + __m128i res128 = cvtfp32_fp8e5m2(src); + return at::vec::Vectorized(_mm512_castsi128_si512(res128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src_n) { + // cvt first 16x8 bits from Float8_e5m2 to float + at::vec::Vectorized src = src_n[0]; + __m512 result; + cvtfp8e5m2_fp32(_mm512_castsi512_si128(src), result); + return at::vec::Vectorized(result); + } +}; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } // namespace CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h index 4d2554f231d4..150c03ee8a2e 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h @@ -11,17 +11,32 @@ #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template <> class Vectorized { private: static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> 
+class Vectorized { + private: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // values needs to be public for compilation with clang // as vec512.h uses it __m512d values; @@ -35,14 +50,27 @@ template <> class Vectorized { Vectorized(double val) { values = _mm512_set1_pd(val); } +<<<<<<< HEAD Vectorized(double val1, double val2, double val3, double val4, double val5, double val6, double val7, double val8) { +======= + Vectorized( + double val1, + double val2, + double val3, + double val4, + double val5, + double val6, + double val7, + double val8) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) values = _mm512_setr_pd(val1, val2, val3, val4, val5, val6, val7, val8); } operator __m512d() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm512_mask_blend_pd(mask, a.values, b.values); } @@ -60,6 +88,40 @@ template <> class Vectorized { } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mask_blend_pd(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); + auto mmask = _mm512_cmp_epi64_mask( + _mm512_castpd_si512(mask.values), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_pd(mmask, a.values, b.values); + } + template + static Vectorized arange( + double base = 0., + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -95,14 +157,23 @@ template <> class Vectorized { _mm512_mask_storeu_pd(reinterpret_cast(ptr), mask, values); } } +<<<<<<< HEAD const double& operator[](int idx) const = delete; double& operator[](int idx) = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __mmask8 cmp = _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_EQ_OQ); return static_cast(cmp); } Vectorized isnan() const { +<<<<<<< HEAD auto cmp_mask = _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_UNORD_Q); return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); @@ -110,6 +181,17 @@ template <> class Vectorized { bool has_inf_nan() const { __m512d self_sub = _mm512_sub_pd(values, values); return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & 0x7777777777777777) != 0; +======= + auto cmp_mask = + _mm512_cmp_pd_mask(values, 
_mm512_set1_pd(0.0), _CMP_UNORD_Q); + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); + } + bool has_inf_nan() const { + __m512d self_sub = _mm512_sub_pd(values, values); + return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & + 0x7777777777777777) != 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; @@ -127,10 +209,17 @@ template <> class Vectorized { const auto zero_vec = _mm512_castsi512_pd(zero_vector); const auto nan_vec = _mm512_set1_pd(NAN); const auto not_nan_mask = _mm512_cmp_pd_mask(values, values, _CMP_EQ_OQ); +<<<<<<< HEAD const auto not_nan = _mm512_mask_set1_epi64(zero_vector, not_nan_mask, 0xFFFFFFFFFFFFFFFF); const auto nan_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(not_nan), zero_vec, _CMP_EQ_OQ); +======= + const auto not_nan = + _mm512_mask_set1_epi64(zero_vector, not_nan_mask, 0xFFFFFFFFFFFFFFFF); + const auto nan_mask = + _mm512_cmp_pd_mask(_mm512_castsi512_pd(not_nan), zero_vec, _CMP_EQ_OQ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto pi = _mm512_set1_pd(c10::pi); const auto neg_mask = _mm512_cmp_pd_mask(values, zero_vec, _CMP_LT_OQ); @@ -165,10 +254,17 @@ template <> class Vectorized { Vectorized atanh() const { return Vectorized(Sleef_atanhd8_u10(values)); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { return Vectorized(Sleef_atan2d8_u10(values, b)); } Vectorized copysign(const Vectorized &sign) const { +======= + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2d8_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_copysignd8(values, sign)); } Vectorized erf() const { @@ -195,7 +291,11 @@ template <> class Vectorized { Vectorized fmod(const Vectorized& q) const { return Vectorized(Sleef_fmodd8(values, q)); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_hypotd8_u05(values, b)); } Vectorized i0() const { @@ -207,7 +307,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -217,7 +321,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ double tmp[size()]; __at_align__ double tmp_x[size()]; store(tmp); @@ -261,11 +369,20 @@ template <> class Vectorized { Vectorized neg() const { return _mm512_xor_pd(_mm512_set1_pd(-0.), values); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { 
return Vectorized(Sleef_nextafterd8(values, b)); } Vectorized round() const { return _mm512_roundscale_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterd8(values, b)); + } + Vectorized round() const { + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized tan() const { return Vectorized(Sleef_tand8_u10(values)); @@ -274,7 +391,12 @@ template <> class Vectorized { return Vectorized(Sleef_tanhd8_u10(values)); } Vectorized trunc() const { +<<<<<<< HEAD return _mm512_roundscale_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_pd( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized lgamma() const { return Vectorized(Sleef_lgammad8_u10(values)); @@ -288,7 +410,11 @@ template <> class Vectorized { Vectorized rsqrt() const { return _mm512_div_pd(_mm512_set1_pd(1), _mm512_sqrt_pd(values)); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_powd8_u10(values, b)); } // Comparison using the _CMP_**_OQ predicate. @@ -296,38 +422,68 @@ template <> class Vectorized { // `Q`: do not raise if an operand is NaN Vectorized operator==(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator!=(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator<(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LT_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator<=(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LE_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) } Vectorized operator>(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GT_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator>=(const Vectorized& other) const { auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GE_OQ); +<<<<<<< HEAD return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +======= + return _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized eq(const Vectorized& other) const; @@ -339,22 +495,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mul_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_div_pd(a, b); } @@ -366,12 +546,23 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { auto zero_vec = _mm512_set1_epi64(0); Vectorized max = _mm512_max_pd(a, b); auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); auto isnan = _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi64(0); + Vectorized max = _mm512_max_pd(a, b); + auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. 
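// Illustrative aside -- a scalar model, not code from this patch, of the NaN-propagating
// maximum implemented above: if either operand is unordered the lane is forced to NaN,
// which the vector code achieves by OR-ing in an all-ones lane (an all-ones bit pattern
// decodes to a quiet NaN).
#include <cmath>
#include <cstdio>
#include <limits>

static double maximum_propagating_nan(double a, double b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<double>::quiet_NaN();
  return a > b ? a : b;
}

int main() {
  std::printf("%f\n", maximum_propagating_nan(1.0, 2.0));  // 2.000000
  std::printf("%f\n", maximum_propagating_nan(1.0, NAN));  // nan
  return 0;
}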
return _mm512_or_pd(max, isnan); } @@ -379,42 +570,85 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized< // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { auto zero_vec = _mm512_set1_epi64(0); Vectorized min = _mm512_min_pd(a, b); auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); auto isnan = _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi64(0); + Vectorized min = _mm512_min_pd(a, b); + auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_pd( + _mm512_mask_set1_epi64(zero_vec, isnan_mask, 0xFFFFFFFFFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. return _mm512_or_pd(min, isnan); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_pd(max, _mm512_max_pd(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_pd(min, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_pd(max, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_pd(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { return _mm512_xor_pd(a, b); } @@ -440,6 +674,41 @@ inline Vectorized Vectorized::lt(const Vectorized& other } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_pd(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::gt( + 
const Vectorized& other) const { + return (*this > other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0); } @@ -449,7 +718,12 @@ inline void convert(const double* src, double* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_pd(dst + i, _mm512_loadu_pd(src + i)); } #ifndef __msvc_cl__ @@ -461,15 +735,34 @@ inline void convert(const double* src, double* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_fmadd_pd(a, b, c); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_fmsub_pd(a, b, c); } #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 43a8e5c48cbe..d9627b416d8f 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -11,17 +11,32 @@ #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { #if defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template <> class Vectorized { private: static constexpr __m512i zero_vec {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized { + private: + static constexpr __m512i zero_vec{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 values; using value_type = float; using size_type = int; @@ -33,6 +48,7 @@ template <> class Vectorized { Vectorized(float val) { values = _mm512_set1_ps(val); } +<<<<<<< HEAD Vectorized(float val1, float val2, float val3, float val4, float val5, float val6, float val7, float val8, float val9, float val10, float val11, float val12, @@ -43,10 +59,66 @@ template <> class Vectorized { Vectorized(const float 
(&arr)[16]) : Vectorized(arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7], arr[8], arr[9], arr[10], arr[11], arr[12], arr[13], arr[14], arr[15]) {} +======= + Vectorized( + float val1, + float val2, + float val3, + float val4, + float val5, + float val6, + float val7, + float val8, + float val9, + float val10, + float val11, + float val12, + float val13, + float val14, + float val15, + float val16) { + values = _mm512_setr_ps( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + Vectorized(const float (&arr)[16]) + : Vectorized( + arr[0], + arr[1], + arr[2], + arr[3], + arr[4], + arr[5], + arr[6], + arr[7], + arr[8], + arr[9], + arr[10], + arr[11], + arr[12], + arr[13], + arr[14], + arr[15]) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) operator __m512() const { return values; } template +<<<<<<< HEAD static Vectorized blend(const Vectorized& a, const Vectorized& b) { return _mm512_mask_blend_ps(mask, a.values, b.values); } @@ -66,6 +138,48 @@ template <> class Vectorized { } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + return _mm512_mask_blend_ps(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask( + _mm512_castps_si512(mask.values), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_blend_ps(mmask, a.values, b.values); + } + template + static Vectorized arange( + float base = 0.f, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -117,14 +231,23 @@ template <> class Vectorized { _mm512_mask_storeu_ps(reinterpret_cast(ptr), mask, values); } } +<<<<<<< HEAD const float& operator[](int idx) const = delete; float& operator[](int idx) = delete; int zero_mask() const { // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit +======= + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __mmask16 cmp = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_EQ_OQ); return static_cast(cmp); } Vectorized isnan() const { +<<<<<<< HEAD auto mask = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_UNORD_Q); return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); @@ -132,6 +255,16 @@ template <> class Vectorized { bool has_inf_nan() const { __m512 self_sub = _mm512_sub_ps(values, values); 
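// Illustrative aside -- a scalar sketch, not code from this patch, of the test behind
// has_inf_nan() above: x - x is 0 for every finite x but NaN when x is +/-Inf or NaN,
// so checking the subtraction result for NaN catches both cases; the vector version
// applies the same idea lane-wise and inspects the resulting bit patterns.
#include <cmath>
#include <cstdio>
#include <limits>

static bool scalar_has_inf_nan(float x) {
  float d = x - x;  // 0.0f for finite x, NaN for Inf or NaN inputs
  return std::isnan(d);
}

int main() {
  std::printf("%d %d %d\n",
              scalar_has_inf_nan(1.5f),                                    // 0
              scalar_has_inf_nan(std::numeric_limits<float>::infinity()),  // 1
              scalar_has_inf_nan(std::nanf("")));                          // 1
  return 0;
}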
return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & 0x7777777777777777) != 0; +======= + auto mask = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_UNORD_Q); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); + } + bool has_inf_nan() const { + __m512 self_sub = _mm512_sub_ps(values, values); + return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & + 0x7777777777777777) != 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; @@ -149,10 +282,17 @@ template <> class Vectorized { __m512 zero_vec = _mm512_set1_ps(0.f); const auto nan_vec = _mm512_set1_ps(NAN); const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ); +<<<<<<< HEAD const auto not_nan_vec = _mm512_mask_set1_epi32(_mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); const auto nan_mask = _mm512_cmp_ps_mask(_mm512_castsi512_ps(not_nan_vec), zero_vec, _CMP_EQ_OQ); +======= + const auto not_nan_vec = _mm512_mask_set1_epi32( + _mm512_castps_si512(zero_vec), not_nan_mask, 0xFFFFFFFF); + const auto nan_mask = _mm512_cmp_ps_mask( + _mm512_castsi512_ps(not_nan_vec), zero_vec, _CMP_EQ_OQ); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto pi = _mm512_set1_ps(c10::pi); const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ); @@ -187,10 +327,17 @@ template <> class Vectorized { Vectorized atanh() const { return Vectorized(Sleef_atanhf16_u10(values)); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &b) const { return Vectorized(Sleef_atan2f16_u10(values, b)); } Vectorized copysign(const Vectorized &sign) const { +======= + Vectorized atan2(const Vectorized& b) const { + return Vectorized(Sleef_atan2f16_u10(values, b)); + } + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_copysignf16(values, sign)); } Vectorized erf() const { @@ -258,9 +405,18 @@ template <> class Vectorized { const __m512 vec_one = _mm512_set1_ps(1.f); const __m512 vec_zero = _mm512_set1_ps(0.f); const __m512 vec_two = _mm512_set1_ps(2.f); +<<<<<<< HEAD const __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) const __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); const __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); +======= + const __m512 vec_ln2f = + _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) + const __m512 vec_ln_flt_min = + _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); + const __m512 vec_ln_flt_max = + _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); const int n_mantissa_bits = 23; @@ -338,7 +494,11 @@ template <> class Vectorized { Vectorized floor() const { return _mm512_floor_ps(values); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_hypotf16_u05(values, b)); } Vectorized i0() 
const { @@ -350,7 +510,11 @@ template <> class Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -360,7 +524,11 @@ template <> class Vectorized { } return loadu(tmp); } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __at_align__ float tmp[size()]; __at_align__ float tmp_x[size()]; store(tmp); @@ -373,11 +541,20 @@ template <> class Vectorized { Vectorized neg() const { return _mm512_xor_ps(_mm512_set1_ps(-0.f), values); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { return Vectorized(Sleef_nextafterf16(values, b)); } Vectorized round() const { return _mm512_roundscale_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +======= + Vectorized nextafter(const Vectorized& b) const { + return Vectorized(Sleef_nextafterf16(values, b)); + } + Vectorized round() const { + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized tan() const { return Vectorized(Sleef_tanf16_u10(values)); @@ -386,7 +563,12 @@ template <> class Vectorized { return Vectorized(Sleef_tanhf16_u10(values)); } Vectorized trunc() const { +<<<<<<< HEAD return _mm512_roundscale_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +======= + return _mm512_roundscale_ps( + values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized lgamma() const { return Vectorized(Sleef_lgammaf16_u10(values)); @@ -400,7 +582,11 @@ template <> class Vectorized { Vectorized rsqrt() const { return _mm512_div_ps(_mm512_set1_ps(1), _mm512_sqrt_ps(values)); } +<<<<<<< HEAD Vectorized pow(const Vectorized &b) const { +======= + Vectorized pow(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized(Sleef_powf16_u10(values, b)); } float reduce_add() const { @@ -414,38 +600,68 @@ template <> class Vectorized { // `Q`: do not raise if an operand is NaN Vectorized operator==(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator!=(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed 
dtypes with float/bfloat16/half (#2791)) } Vectorized operator<(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_LT_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator<=(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_LE_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator>(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_GT_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized operator>=(const Vectorized& other) const { auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_GE_OQ); +<<<<<<< HEAD return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +======= + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized eq(const Vectorized& other) const; @@ -457,22 +673,46 @@ template <> class Vectorized { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mul_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_div_ps(a, b); } @@ -484,12 +724,23 @@ inline Vectorized Vectorized::frac() const { // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. 
template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { auto zero_vec = _mm512_set1_epi32(0); auto max = _mm512_max_ps(a, b); auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi32(0); + auto max = _mm512_max_ps(a, b); + auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. return _mm512_or_ps(max, isnan); } @@ -497,42 +748,85 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { auto zero_vec = _mm512_set1_epi32(0); auto min = _mm512_min_ps(a, b); auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + auto zero_vec = _mm512_set1_epi32(0); + auto min = _mm512_min_ps(a, b); + auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q); + auto isnan = _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, isnan_mask, 0xFFFFFFFF)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Exploit the fact that all-ones is a NaN. return _mm512_or_ps(min, isnan); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_ps(max, _mm512_max_ps(min, a)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_ps(max, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_ps(min, a); } template <> +<<<<<<< HEAD Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_and_ps(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_or_ps(a, b); } template <> 
+<<<<<<< HEAD Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { return _mm512_xor_ps(a, b); } @@ -558,6 +852,41 @@ inline Vectorized Vectorized::lt(const Vectorized& other) c } inline Vectorized Vectorized::le(const Vectorized& other) const { +======= +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return _mm512_xor_ps(a, b); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return (*this <= other) & Vectorized(1.0f); } @@ -567,7 +896,12 @@ inline void convert(const float* src, float* dst, int64_t n) { #ifndef __msvc_cl__ #pragma unroll #endif +<<<<<<< HEAD for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { +======= + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_ps(dst + i, _mm512_loadu_ps(src + i)); } #ifndef __msvc_cl__ @@ -579,12 +913,26 @@ inline void convert(const float* src, float* dst, int64_t n) { } template <> +<<<<<<< HEAD Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_fmadd_ps(a, b, c); } template <> +<<<<<<< HEAD Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { +======= +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_fmsub_ps(a, b, c); } @@ -594,7 +942,14 @@ Vectorized inline fmsub(const Vectorized& a, const Vectorized &input, int M=16, int N=16) { +======= +inline void transpose_block( + at::vec::VectorizedN& input, + int M = 16, + int N = 16) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(M <= 16 && N <= 16, "transpose_block expects M, N <= 16."); // unpacking and interleaving 32-bit elements __m512 temp[16]; @@ -653,7 +1008,17 @@ inline void transpose_block(at::vec::VectorizedN &input, int M=16, in // https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L230-L304 // kernel for transposing mxn where m, n <= 16 // M + (M + 1) / 2 * 2 + (M + 3) / 4 * 4 + (M + 7) / 8 * 8 + 2 * N instructions +<<<<<<< HEAD inline void transpose_mxn_16x16(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) { +======= +inline void transpose_mxn_16x16( 
+ const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst, + int M, + int N) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(M <= 16 && N <= 16, "transpose_mxn expects M, N <= 16."); // load from src to registers at::vec::VectorizedN input; @@ -690,8 +1055,19 @@ inline void transpose_mxn_16x16(const float* src, int64_t ld_src, float* dst, in } } +<<<<<<< HEAD template<> inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst, int M, int N) { +======= +template <> +inline void transpose_mxn( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst, + int M, + int N) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t i = 0; for (; i < M / 16 * 16; i += 16) { int64_t j = 0; @@ -721,12 +1097,30 @@ inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, i } } +<<<<<<< HEAD template , int> = 0> inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst) { +======= +template < + typename T, + int M, + int N, + typename std::enable_if_t, int> = 0> +inline void transpose_mxn( + const float* src, + int64_t ld_src, + float* dst, + int64_t ld_dst) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) transpose_mxn(src, ld_src, dst, ld_dst, M, N); } #endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float8.h b/aten/src/ATen/cpu/vec/vec512/vec512_float8.h new file mode 100644 index 000000000000..12ee4c460641 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float8.h @@ -0,0 +1,661 @@ +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! 
+// See Note [Do not compile initializers with AVX] + +#include +#include +#if (defined(CPU_CAPABILITY_AVX512)) +#define SLEEF_STATIC_LIBS +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +static inline void cvtfp8e4m3_fp32(const __m128i& a, __m512& o) { + // Zero Extend + __m512i x = _mm512_cvtepu8_epi32(a); + __m512i val = _mm512_and_epi32( + _mm512_slli_epi32(x, 24), _mm512_set1_epi32(0x7FFFFFFF)); // nonsign_val + __m512i mant = + _mm512_and_si512(x, _mm512_set1_epi32(0x07)); // mantissa = x & 0x07 + __m512i exp = _mm512_and_si512( + _mm512_srli_epi32(x, 3), + _mm512_set1_epi32(0x0F)); // exp = (x >> 3) & 0x0F + __m512i sign = + _mm512_and_si512(x, _mm512_set1_epi32(0x80)); // sign = x & 0x80 + __m512i _zeros = _mm512_setzero_si512(); + + // --- Step 1: Calculate the renorm_shift + __m512i renorm_shift = _zeros; + // Denorm case (exp == 0 && mant != 0) --- + __mmask16 denormal_mask = _mm512_cmpeq_epi32_mask(exp, _zeros) & + _mm512_cmpneq_epi32_mask(mant, _zeros); + if (denormal_mask) { + // An alternative solution is as what scalar did in + // pytorch/c10/util/Float8_e4m3fn.h To count the num of leading zeros, since + // here we know the unsigned denorm value has zero sign and exp which is 5 + // leading zeros, we need to count the leading zero of mant (3bit) which may + // done through table lookup for example: const uint8_t lz_table[8] = {3, 2, + // 1, 1, 0, 0, 0, 0}; num_leading_zero = lz_table[mant] + 5; + + __m512i _ones = _mm512_set1_epi32(1); + __m512i _twos = _mm512_set1_epi32(2); + __m512i _threes = _mm512_set1_epi32(3); + + // Default leading zero number for denorm value is 1 = 5 - 4 + __m512i denorm_renorm_shift = _ones; + // For mant 001, leading zero number is 3 = 7 -4 + __mmask16 leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _ones); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _threes); + // For mant 010 and 011, leading zero number is 2 = 6 -4 + leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _twos); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _twos); + leading_Zero_mask = _mm512_cmpeq_epi32_mask(mant, _threes); + denorm_renorm_shift = + _mm512_mask_mov_epi32(denorm_renorm_shift, leading_Zero_mask, _twos); + + renorm_shift = + _mm512_mask_mov_epi32(renorm_shift, denormal_mask, denorm_renorm_shift); + } + + // --- Step 2: calculate norm and denorm --- + __m512i norm_shifted = + _mm512_srli_epi32(_mm512_sllv_epi32(val, renorm_shift), 4); + // exponent bias adjustment: (0x78 - renorm_shift) << 23 + __m512i exp_bias = _mm512_slli_epi32( + _mm512_sub_epi32(_mm512_set1_epi32(0x78), renorm_shift), 23); + val = _mm512_add_epi32(norm_shifted, exp_bias); + + // --- Step 3: Nan case (exp == 0xF && mant == 0x07) --- + __mmask16 nan_mask = _mm512_cmpeq_epi32_mask(exp, _mm512_set1_epi32(0xF)) & + _mm512_cmpeq_epi32_mask(mant, _mm512_set1_epi32(0x07)); + if (nan_mask) { + const __m512i nan_values = _mm512_set1_epi32(0x7FC00000); + val = _mm512_mask_mov_epi32(val, nan_mask, nan_values); + } + + // --- Step 4: Zero case (exp == 0x00 && mant == 0x00) --- + __mmask16 zero_mask = _mm512_cmpeq_epi32_mask(exp, _zeros) & + _mm512_cmpeq_epi32_mask(mant, _zeros); + if (zero_mask) { + val = _mm512_mask_mov_epi32(val, zero_mask, _zeros); + } + + // --- Step 5: OR with sign (sign bit << 24 to get to bit 31) --- + val = _mm512_or_si512(val, _mm512_slli_epi32(sign, 24)); + + o = 
_mm512_castsi512_ps(val); +} + +static inline __m128i cvtfp32_fp8e4m3(const __m512& src) { + // cvt 16x32 from fp32 to fp8 e4m3 + const __m512i sign_mask = _mm512_set1_epi32(0x80000000); + const __m512i fp8_max = _mm512_set1_epi32(UINT32_C(1087) << 20); + const __m512i denorm_thresh = _mm512_set1_epi32(UINT32_C(121) << 23); + const __m512i denorm_mask = _mm512_set1_epi32(UINT32_C(141) << 23); + const __m512i bias_part1 = _mm512_set1_epi32((uint32_t)(7 - 127) << 23); + const __m512i rounding_bias = _mm512_set1_epi32(0x7FFFF); + __m512i f_bits = _mm512_castps_si512(src); + // Extract and save sign + __m512i sign = _mm512_and_epi32(f_bits, sign_mask); + f_bits = _mm512_xor_epi32(f_bits, sign); + + // Prepare result containers + __m512i result = _mm512_setzero_si512(); + + // Step 1: Handle case of overflow + // (f_bits >= fp8_max): set result = 0x7f + __mmask16 overflow_mask = _mm512_cmpge_epu32_mask(f_bits, fp8_max); + if (overflow_mask) { + result = _mm512_mask_set1_epi32(result, overflow_mask, 0x7f); + } + + // Step 2: Handle small numbers (denormals) + // Small numbers (f_bits < denorm_thresh) + __mmask16 denorm_thresh_mask = _mm512_cmplt_epu32_mask(f_bits, denorm_thresh); + + if (denorm_thresh_mask) { + __m512 small_input = _mm512_castsi512_ps(f_bits); + __m512 small_denorm = + _mm512_add_ps(small_input, _mm512_castsi512_ps(denorm_mask)); + __m512i small_denorm_bits = _mm512_castps_si512(small_denorm); + __m512i small_result = _mm512_sub_epi32(small_denorm_bits, denorm_mask); + result = _mm512_mask_mov_epi32(result, denorm_thresh_mask, small_result); + } + + // Step 3: Handle normal numbers + __mmask16 normal_mask = ~(overflow_mask | denorm_thresh_mask); + + if (normal_mask) { + // mant_odd = (f_bits >> 20) & 1 + __m512i mant_odd = + _mm512_and_epi32(_mm512_srli_epi32(f_bits, 20), _mm512_set1_epi32(1)); + // f_bits += bias_part1 + rounding_bias + __m512i rounded = _mm512_add_epi32(f_bits, bias_part1); + rounded = _mm512_add_epi32(rounded, rounding_bias); + // Add mant_odd + rounded = _mm512_add_epi32(rounded, mant_odd); + // Shift right by 20 bits + __m512i normal_result = _mm512_srli_epi32(rounded, 20); + result = _mm512_mask_mov_epi32(result, normal_mask, normal_result); + } + + // Merge back the sign + __m512i sign_shifted = _mm512_srli_epi32(sign, 24); + result = _mm512_or_epi32(result, sign_shifted); + + // Now result is 16 x 32-bit integers, but we only need 8-bit for each + __m512i packed = _mm512_and_si512(result, _mm512_set1_epi32(0xFF)); + + // Narrow 32-bit integers to 8-bit + return _mm512_cvtepi32_epi8(packed); +} + +static inline float fp8e4m3_to_fp32_scalar(uint8_t val) { + __m512i v = _mm512_set1_epi8(val); + __m128i v_128 = _mm512_castsi512_si128(v); + __m512 o; + cvtfp8e4m3_fp32(v_128, o); + return _mm512_cvtss_f32(o); +} + +static inline uint8_t fp32_to_fp8e4m3_scalar(float val) { + __m512 v = _mm512_set1_ps(val); + __m128i o = cvtfp32_fp8e4m3(v); + return static_cast(_mm_cvtsi128_si32(o)); +} + +static inline void cvtfp8e5m2_fp32(const __m128i& a, __m512& o) { + __m256i a_256 = _mm256_castsi128_si256(a); + __m512i a_512 = _mm512_cvtepu8_epi16(a_256); + a_512 = _mm512_slli_epi16(a_512, 8); + a_256 = _mm512_castsi512_si256(a_512); + cvtfp16_fp32(a_256, o); +} + +static inline __m128i cvtfp32_fp8e5m2(const __m512& src) { + constexpr uint32_t fp32_inf = UINT32_C(255) << 23; + constexpr uint32_t fp8_max = UINT32_C(143) << 23; + constexpr uint32_t denorm_mask = UINT32_C(134) << 23; + + // Cvt to bits + __m512i input_bits = _mm512_castps_si512(src); + __m512i result = 
_mm512_setzero_si512(); + + // Get the sign + __m512i sign = _mm512_and_si512(input_bits, _mm512_set1_epi32(0x80000000)); + + // Get the unsigned input + input_bits = _mm512_xor_si512(input_bits, sign); + + // Calculate the mask for inf, nan and denorm + __mmask16 greater_than_fp8_max = + _mm512_cmpge_epi32_mask(input_bits, _mm512_set1_epi32(fp8_max)); + __mmask16 greater_than_fp32_inf = + _mm512_cmpgt_epi32_mask(input_bits, _mm512_set1_epi32(fp32_inf)); + __mmask16 less_than_normal = _mm512_cmpgt_epi32_mask( + _mm512_set1_epi32((UINT32_C(113) << 23)), input_bits); + __m512i temp_bits_for_denorm = _mm512_setzero_si512(); + if (less_than_normal) { + __m512i denorm_mask_512i = _mm512_set1_epi32(denorm_mask); + temp_bits_for_denorm = _mm512_castps_si512(_mm512_add_ps( + _mm512_castsi512_ps(input_bits), + _mm512_castsi512_ps(denorm_mask_512i))); + temp_bits_for_denorm = + _mm512_sub_epi32(temp_bits_for_denorm, denorm_mask_512i); + } + + // Step 1: Norm Val + __m512i mant_odd_mask = + _mm512_and_epi32(_mm512_srli_epi32(input_bits, 21), _mm512_set1_epi32(1)); + input_bits = _mm512_add_epi32( + input_bits, _mm512_set1_epi32(((uint32_t)(15 - 127) << 23) + 0xFFFFF)); + input_bits = _mm512_add_epi32(input_bits, mant_odd_mask); + result = _mm512_srli_epi32(input_bits, 21); + + // Step 2: INF and NAN + if (greater_than_fp8_max) { + result = _mm512_mask_mov_epi32( + result, greater_than_fp8_max, _mm512_set1_epi8(0x7C)); + if (greater_than_fp32_inf) { + result = _mm512_mask_mov_epi32( + result, greater_than_fp32_inf, _mm512_set1_epi8(0x7F)); + } + } + + // Step 3: Denorm val + if (less_than_normal) { + result = + _mm512_mask_mov_epi32(result, less_than_normal, temp_bits_for_denorm); + } + + // Step 4: restore sign + result = _mm512_or_si512(result, _mm512_srli_epi32(sign, 24)); + + return _mm512_cvtepi32_epi8(result); +} + +static inline float fp8e5m2_to_fp32_scalar(uint8_t val) { + __m512i v = _mm512_set1_epi8(val); + __m128i v_128 = _mm512_castsi512_si128(v); + __m512 o; + cvtfp8e5m2_fp32(v_128, o); + return _mm512_cvtss_f32(o); +} + +static inline uint8_t fp32_to_fp8e5m2_scalar(float val) { + __m512 v = _mm512_set1_ps(val); + __m128i o = cvtfp32_fp8e5m2(v); + return static_cast(_mm_cvtsi128_si32(o)); +} + +template +class Vectorizedf8 { + static_assert( + std::integral_constant < bool, + std::is_same_v || std::is_same_v < T, + at::Float8_e5m2 >> ::value, + "Support only float8 e4m3."); + + private: + __m512i values; + template + Vectorized inline binary_compare(const VectorizedType& b, Op op) const { + __m512 a0, a1, a2, a3; + __m512 b0, b1, b2, b3; + __m512 o0, o1, o2, o3; + if constexpr (std::is_same_v) { + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 0), a0); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 0), b0); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 1), a1); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 1), b1); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 2), a2); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 2), b2); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(values, 3), a3); + cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b.values, 3), b3); + } else { + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 0), a0); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 0), b0); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 1), a1); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 1), b1); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 2), a2); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 2), b2); + 
cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(values, 3), a3); + cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b.values, 3), b3); + } + + o0 = op(a0, b0); + o1 = op(a1, b1); + o2 = op(a2, b2); + o3 = op(a3, b3); + __m128i o128_0, o128_1, o128_2, o128_3; + if constexpr (std::is_same_v) { + o128_0 = cvtfp32_fp8e4m3(o0); + o128_1 = cvtfp32_fp8e4m3(o1); + o128_2 = cvtfp32_fp8e4m3(o2); + o128_3 = cvtfp32_fp8e4m3(o3); + } else { + o128_0 = cvtfp32_fp8e5m2(o0); + o128_1 = cvtfp32_fp8e5m2(o1); + o128_2 = cvtfp32_fp8e5m2(o2); + o128_3 = cvtfp32_fp8e5m2(o3); + } + + __m512i result = _mm512_setzero_si512(); + result = _mm512_inserti32x4(result, o128_0, 0); + result = _mm512_inserti32x4(result, o128_1, 1); + result = _mm512_inserti32x4(result, o128_2, 2); + result = _mm512_inserti32x4(result, o128_3, 3); + + return result; + } + + public: + using value_type = uint8_t; + using size_type = int; + static constexpr size_type size() { + return 64; + } + Vectorizedf8() {} + Vectorizedf8(__m512i v) : values(v) {} + Vectorizedf8(T val) { + value_type uw = val.x; + values = _mm512_set1_epi8(uw); + } + operator __m512i() const { + return values; + } + T& operator[](int idx) = delete; + const T& operator[](int idx) const = delete; + static Vectorized loadu(const void* ptr, int16_t count = size()) { + if (count == size()) { + return _mm512_loadu_si512(reinterpret_cast(ptr)); + } else if (count == 16) { + // Fast path if only load element number of 16 + __m128i input_128 = + _mm_loadu_si128(reinterpret_cast(ptr)); + return _mm512_castsi128_si512(input_128); + } else { + __mmask64 mask = (1ULL << count) - 1; + return _mm512_maskz_loadu_epi8(mask, ptr); + } + } + void store(void* ptr, int count = size()) const { + if (count == size()) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values); + } else if (count > 0) { + if (count == 16) { + // Fast path if only store element number of 16 + _mm_storeu_si128( + reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values)); + } else { + __mmask64 mask = (1ULL << count) - 1; + _mm512_mask_storeu_epi8(ptr, mask, values); + } + } + } + + Vectorized abs() const { + return _mm512_andnot_si512(_mm512_set1_epi8(0x80), values); + } + + Vectorized inline operator==(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator!=(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator>(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator>=(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto zero_vec = _mm512_set1_epi32(0); + auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); + return _mm512_castsi512_ps( + _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF)); + }); + } + + Vectorized inline operator<(const Vectorizedf8& other) const { + return binary_compare(other, [](__m512 x, __m512 y) { + auto 
zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+
+  Vectorized<T> inline operator<=(const Vectorizedf8& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ);
+      return _mm512_castsi512_ps(
+          _mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+};
+
+template <>
+class Vectorized<Float8_e4m3fn> : public Vectorizedf8<Float8_e4m3fn> {
+ public:
+  using Vectorizedf8<Float8_e4m3fn>::Vectorizedf8;
+
+  using value_type = Float8_e4m3fn;
+
+  Vectorized<Float8_e4m3fn> eq(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> ne(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> gt(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> ge(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> lt(const Vectorized<Float8_e4m3fn>& other) const;
+  Vectorized<Float8_e4m3fn> le(const Vectorized<Float8_e4m3fn>& other) const;
+};
+
+template <
+    typename T,
+    typename Op,
+    std::enable_if_t<
+        std::is_same_v<T, at::Float8_e4m3fn> ||
+            std::is_same_v<T, at::Float8_e5m2>,
+        int> = 0>
+static inline Vectorized<T> binary_fp8_op_as_fp32(
+    const Vectorized<T>& a,
+    const Vectorized<T>& b,
+    Op op) {
+  __m512 a0, a1, a2, a3;
+  __m512 b0, b1, b2, b3;
+  __m512 o0, o1, o2, o3;
+  if constexpr (std::is_same_v<T, at::Float8_e4m3fn>) {
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 0), a0);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 0), b0);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 1), a1);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 1), b1);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 2), a2);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 2), b2);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(a, 3), a3);
+    cvtfp8e4m3_fp32(_mm512_extracti32x4_epi32(b, 3), b3);
+  } else {
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 0), a0);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 0), b0);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 1), a1);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 1), b1);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 2), a2);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 2), b2);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(a, 3), a3);
+    cvtfp8e5m2_fp32(_mm512_extracti32x4_epi32(b, 3), b3);
+  }
+  o0 = op(a0, b0);
+  o1 = op(a1, b1);
+  o2 = op(a2, b2);
+  o3 = op(a3, b3);
+
+  __m128i o128_0, o128_1, o128_2, o128_3;
+  if constexpr (std::is_same_v<T, at::Float8_e4m3fn>) {
+    o128_0 = cvtfp32_fp8e4m3(o0);
+    o128_1 = cvtfp32_fp8e4m3(o1);
+    o128_2 = cvtfp32_fp8e4m3(o2);
+    o128_3 = cvtfp32_fp8e4m3(o3);
+  } else {
+    o128_0 = cvtfp32_fp8e5m2(o0);
+    o128_1 = cvtfp32_fp8e5m2(o1);
+    o128_2 = cvtfp32_fp8e5m2(o2);
+    o128_3 = cvtfp32_fp8e5m2(o3);
+  }
+
+  __m512i result = _mm512_setzero_si512();
+  result = _mm512_inserti32x4(result, o128_0, 0);
+  result = _mm512_inserti32x4(result, o128_1, 1);
+  result = _mm512_inserti32x4(result, o128_2, 2);
+  result = _mm512_inserti32x4(result, o128_3, 3);
+
+  return result;
+}
+
+// Refer to
+// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353.
+// FP8 +, -, *, and / are planned to be deleted in the future; they exist here
+// only to keep the compiler happy.
+Vectorized<Float8_e4m3fn> inline operator+(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_add_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e4m3fn> inline operator-(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_sub_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e4m3fn> inline operator*(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_mul_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e4m3fn> inline operator/(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_div_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e4m3fn> inline operator&(
+    const Vectorized<Float8_e4m3fn>& a,
+    const Vectorized<Float8_e4m3fn>& b) {
+  return _mm512_and_si512(a, b);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::eq(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this == other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::ne(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this != other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::gt(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this > other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::ge(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this >= other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::lt(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this < other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+inline Vectorized<Float8_e4m3fn> Vectorized<Float8_e4m3fn>::le(
+    const Vectorized<Float8_e4m3fn>& other) const {
+  return (*this <= other) & Vectorized<Float8_e4m3fn>(1.0f);
+}
+
+template <>
+class Vectorized<Float8_e5m2> : public Vectorizedf8<Float8_e5m2> {
+ public:
+  using Vectorizedf8<Float8_e5m2>::Vectorizedf8;
+
+  using value_type = Float8_e5m2;
+
+  Vectorized<Float8_e5m2> eq(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> ne(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> gt(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> ge(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> lt(const Vectorized<Float8_e5m2>& other) const;
+  Vectorized<Float8_e5m2> le(const Vectorized<Float8_e5m2>& other) const;
+};
+
+// Refer to
+// https://github.com/pytorch/pytorch/pull/153364#discussion_r2086509353.
+// FP8 +, -, *, and / are planned to be deleted in the future; they exist here
+// only to keep the compiler happy.
+Vectorized<Float8_e5m2> inline operator+(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_add_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e5m2> inline operator-(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_sub_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e5m2> inline operator*(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_mul_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e5m2> inline operator/(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return binary_fp8_op_as_fp32(a, b, [](const __m512& x, const __m512& y) {
+    return _mm512_div_ps(x, y);
+  });
+}
+
+Vectorized<Float8_e5m2> inline operator&(
+    const Vectorized<Float8_e5m2>& a,
+    const Vectorized<Float8_e5m2>& b) {
+  return _mm512_and_si512(a, b);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::eq(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this == other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::ne(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this != other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::gt(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this > other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::ge(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this >= other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::lt(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this < other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+inline Vectorized<Float8_e5m2> Vectorized<Float8_e5m2>::le(
+    const Vectorized<Float8_e5m2>& other) const {
+  return (*this <= other) & Vectorized<Float8_e5m2>(1.0f);
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
index aa19977e332f..12f45990cf8c
100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -8,21 +8,35 @@ #include #include +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::vec { inline namespace CPU_CAPABILITY { #ifdef CPU_CAPABILITY_AVX512 struct Vectorizedi { +<<<<<<< HEAD protected: __m512i values; static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; +======= + protected: + __m512i values; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline __m512i invert(const __m512i& v) { const auto ones = _mm512_set1_epi64(-1); return _mm512_xor_si512(ones, v); } +<<<<<<< HEAD public: +======= + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorizedi() {} Vectorizedi(__m512i v) : values(v) {} operator __m512i() const { @@ -32,17 +46,32 @@ struct Vectorizedi { #else +<<<<<<< HEAD struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined +======= +struct Vectorizedi {}; // dummy definition to make Vectorizedi always defined +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // CPU_CAPABILITY_AVX512 #ifdef CPU_CAPABILITY_AVX512 template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int64_t; using size_type = int; static constexpr size_type size() { @@ -50,6 +79,7 @@ class Vectorized : public Vectorizedi { } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int64_t v) { values = _mm512_set1_epi64(v); } Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4, int64_t val5, int64_t val6, int64_t val7, int64_t val8) { @@ -62,17 +92,63 @@ class Vectorized : public Vectorizedi { } static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + Vectorized(int64_t v) { + values = _mm512_set1_epi64(v); + } + Vectorized( + int64_t val1, + int64_t val2, + int64_t val3, + int64_t val4, + int64_t val5, + int64_t val6, + int64_t val7, + int64_t val8) { + values = _mm512_setr_epi64(val1, val2, val3, val4, val5, val6, val7, val8); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi64(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF); auto mask_ = _mm512_cmp_epi64_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi64(mask_, a.values, b.values); } template +<<<<<<< HEAD static Vectorized arange(int64_t base = 0, step_t step = static_cast(1)) { return Vectorized(base, base + step, base + 2 * step, base + 3 * step, base + 4 * step, base + 5 * step, base + 6 * step, base + 
7 * step); } static Vectorized set(Vectorized a, Vectorized b, int64_t count = size()) { +======= + static Vectorized arange( + int64_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -114,11 +190,20 @@ class Vectorized : public Vectorizedi { _mm512_mask_storeu_epi64(ptr, mask, values); } } +<<<<<<< HEAD const int64_t& operator[](int idx) const = delete; int64_t& operator[](int idx) = delete; Vectorized abs() const { auto is_larger_mask = _mm512_cmpgt_epi64_mask(zero_vector, values); auto is_larger = _mm512_mask_set1_epi64(zero_vector, is_larger_mask, 0xFFFFFFFFFFFFFFFF); +======= + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; + Vectorized abs() const { + auto is_larger_mask = _mm512_cmpgt_epi64_mask(zero_vector, values); + auto is_larger = + _mm512_mask_set1_epi64(zero_vector, is_larger_mask, 0xFFFFFFFFFFFFFFFF); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto inverse = _mm512_xor_si512(values, is_larger); return _mm512_sub_epi64(inverse, is_larger); } @@ -166,17 +251,29 @@ class Vectorized : public Vectorizedi { }; template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; static const Vectorized ones; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; +template <> +class Vectorized : public Vectorizedi { + private: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int32_t; static constexpr int size() { return 16; } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int32_t v) { values = _mm512_set1_epi32(v); } Vectorized(int32_t val1, int32_t val2, int32_t val3, int32_t val4, int32_t val5, int32_t val6, int32_t val7, int32_t val8, @@ -191,11 +288,62 @@ class Vectorized : public Vectorizedi { } static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + Vectorized(int32_t v) { + values = _mm512_set1_epi32(v); + } + Vectorized( + int32_t val1, + int32_t val2, + int32_t val3, + int32_t val4, + int32_t val5, + int32_t val6, + int32_t val7, + int32_t val8, + int32_t val9, + int32_t val10, + int32_t val11, + int32_t val12, + int32_t val13, + int32_t val14, + int32_t val15, + int32_t val16) { + values = _mm512_setr_epi32( + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15, + val16); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi32(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi32(0xFFFFFFFF); auto mask_ = 
_mm512_cmp_epi32_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi32(mask_, a.values, b.values); } template +<<<<<<< HEAD static Vectorized arange(int32_t base = 0, step_t step = static_cast(1)) { return Vectorized( base, base + step, base + 2 * step, base + 3 * step, @@ -205,6 +353,33 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int32_t count = size()) { +======= + static Vectorized arange( + int32_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int32_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -262,8 +437,13 @@ class Vectorized : public Vectorizedi { _mm512_mask_storeu_epi32(ptr, mask, values); } } +<<<<<<< HEAD const int32_t& operator[](int idx) const = delete; int32_t& operator[](int idx) = delete; +======= + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm512_abs_epi32(values); } @@ -316,6 +496,7 @@ class Vectorized : public Vectorizedi { }; template <> +<<<<<<< HEAD inline void convert(const int32_t *src, float *dst, int64_t n) { int64_t i; // int32_t and float have same size @@ -324,11 +505,27 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { #endif for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto input_vec = _mm512_loadu_si512(reinterpret_cast(src + i)); +======= +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; + // int32_t and float have same size +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_vec = + _mm512_loadu_si512(reinterpret_cast(src + i)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto output_vec = _mm512_cvtepi32_ps(input_vec); _mm512_storeu_ps(reinterpret_cast(dst + i), output_vec); } #ifndef _MSC_VER +<<<<<<< HEAD # pragma unroll +======= +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif for (; i < n; i++) { dst[i] = static_cast(src[i]); @@ -336,6 +533,7 @@ inline void convert(const int32_t *src, float *dst, int64_t n) { } template <> +<<<<<<< HEAD inline void convert(const int32_t *src, double *dst, int64_t n) { int64_t i; // int32_t has half the size of double @@ -344,11 +542,27 @@ inline void convert(const int32_t *src, double *dst, int64_t n) { #endif for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { auto input_256_vec = _mm256_loadu_si256(reinterpret_cast(src + i)); +======= +inline void convert(const int32_t* src, double* dst, int64_t n) { + int64_t i; + // int32_t has half the size of double +#ifndef _MSC_VER +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + auto input_256_vec = + 
_mm256_loadu_si256(reinterpret_cast(src + i)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto output_vec = _mm512_cvtepi32_pd(input_256_vec); _mm512_storeu_pd(reinterpret_cast(dst + i), output_vec); } #ifndef _MSC_VER +<<<<<<< HEAD # pragma unroll +======= +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif for (; i < n; i++) { dst[i] = static_cast(src[i]); @@ -356,17 +570,30 @@ inline void convert(const int32_t *src, double *dst, int64_t n) { } template <> +<<<<<<< HEAD class Vectorized : public Vectorizedi { private: static const Vectorized ones; static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; public: +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorizedi { + private: + static const Vectorized ones; + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = int16_t; static constexpr int size() { return 32; } using Vectorizedi::Vectorizedi; Vectorized() {} +<<<<<<< HEAD Vectorized(int16_t v) { values = _mm512_set1_epi16(v); } Vectorized(int16_t val1, int16_t val2, int16_t val3, int16_t val4, int16_t val5, int16_t val6, int16_t val7, int16_t val8, @@ -387,11 +614,94 @@ class Vectorized : public Vectorizedi { } static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= + Vectorized(int16_t v) { + values = _mm512_set1_epi16(v); + } + Vectorized( + int16_t val1, + int16_t val2, + int16_t val3, + int16_t val4, + int16_t val5, + int16_t val6, + int16_t val7, + int16_t val8, + int16_t val9, + int16_t val10, + int16_t val11, + int16_t val12, + int16_t val13, + int16_t val14, + int16_t val15, + int16_t val16, + int16_t val17, + int16_t val18, + int16_t val19, + int16_t val20, + int16_t val21, + int16_t val22, + int16_t val23, + int16_t val24, + int16_t val25, + int16_t val26, + int16_t val27, + int16_t val28, + int16_t val29, + int16_t val30, + int16_t val31, + int16_t val32) { + values = _mm512_set_epi16( + val32, + val31, + val30, + val29, + val28, + val27, + val26, + val25, + val24, + val23, + val22, + val21, + val20, + val19, + val18, + val17, + val16, + val15, + val14, + val13, + val12, + val11, + val10, + val9, + val8, + val7, + val6, + val5, + val4, + val3, + val2, + val1); + } + template + static Vectorized blend( + Vectorized a, + Vectorized b) { + return _mm512_mask_blend_epi16(mask, a.values, b.values); + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi16(0xFFFF); auto mask_ = _mm512_cmp_epi16_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi16(mask_, a.values, b.values); } template +<<<<<<< HEAD static Vectorized arange(int16_t base = 0, step_t step = static_cast(1)) { return Vectorized( base, base + step, base + 2 * step, base + 3 * step, @@ -406,6 +716,49 @@ class Vectorized : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, int16_t count = size()) { +======= + static Vectorized arange( + int16_t base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + 
step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step); + } + static Vectorized set( + Vectorized a, + Vectorized b, + int16_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -495,8 +848,13 @@ class Vectorized : public Vectorizedi { _mm512_mask_storeu_epi16(ptr, mask, values); } } +<<<<<<< HEAD const int16_t& operator[](int idx) const = delete; int16_t& operator[](int idx) = delete; +======= + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { return _mm512_abs_epi16(values); } @@ -546,18 +904,30 @@ class Vectorized : public Vectorizedi { template class Vectorized8 : public Vectorizedi { static_assert( +<<<<<<< HEAD std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); protected: static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; static const Vectorized ones; public: +======= + std::is_same_v || std::is_same_v, + "Only int8_t/uint8_t are supported"); + + protected: + static constexpr __m512i zero_vector{0, 0, 0, 0, 0, 0, 0, 0}; + static const Vectorized ones; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = T; static constexpr int size() { return 64; } using Vectorizedi::Vectorizedi; Vectorized8() {} +<<<<<<< HEAD Vectorized8(T v) { values = _mm512_set1_epi8(v); } Vectorized8(T val1, T val2, T val3, T val4, T val5, T val6, T val7, T val8, @@ -583,12 +953,148 @@ class Vectorized8 : public Vectorizedi { val24, val23, val22, val21, val20, val19, val18, val17, val16, val15, val14, val13, val12, val11, val10, val9, val8, val7, val6, val5, val4, val3, val2, val1); +======= + Vectorized8(T v) { + values = _mm512_set1_epi8(v); + } + Vectorized8( + T val1, + T val2, + T val3, + T val4, + T val5, + T val6, + T val7, + T val8, + T val9, + T val10, + T val11, + T val12, + T val13, + T val14, + T val15, + T val16, + T val17, + T val18, + T val19, + T val20, + T val21, + T val22, + T val23, + T val24, + T val25, + T val26, + T val27, + T val28, + T val29, + T val30, + T val31, + T val32, + T val33, + T val34, + T val35, + T val36, + T val37, + T val38, + T val39, + T val40, + T val41, + T val42, + T val43, + T val44, + T val45, + T val46, + T val47, + T val48, + T val49, + T val50, + T val51, + T val52, + T val53, + T val54, + T val55, + T val56, + T val57, + T val58, + T val59, + T val60, + T val61, + T val62, + T val63, + T val64) { + values = _mm512_set_epi8( + val64, + val63, + val62, + val61, + val60, + val59, + val58, + val57, + val56, + val55, + val54, + val53, + val52, + val51, + val50, + val49, + val48, + val47, + val46, + val45, + val44, + val43, + val42, + val41, + val40, + val39, + val38, + val37, + 
val36, + val35, + val34, + val33, + val32, + val31, + val30, + val29, + val28, + val27, + val26, + val25, + val24, + val23, + val22, + val21, + val20, + val19, + val18, + val17, + val16, + val15, + val14, + val13, + val12, + val11, + val10, + val9, + val8, + val7, + val6, + val5, + val4, + val3, + val2, + val1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template static Vectorized blend(Vectorized a, Vectorized b) { return _mm512_mask_blend_epi8(mask, a.values, b.values); } template +<<<<<<< HEAD static Vectorized arange(T base = 0, step_t step = static_cast(1)) { return Vectorized( base, base + step, base + 2 * step, base + 3 * step, @@ -610,6 +1116,78 @@ class Vectorized8 : public Vectorizedi { } static Vectorized set(Vectorized a, Vectorized b, T count = size()) { +======= + static Vectorized arange( + T base = 0, + step_t step = static_cast(1)) { + return Vectorized( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step, + base + 16 * step, + base + 17 * step, + base + 18 * step, + base + 19 * step, + base + 20 * step, + base + 21 * step, + base + 22 * step, + base + 23 * step, + base + 24 * step, + base + 25 * step, + base + 26 * step, + base + 27 * step, + base + 28 * step, + base + 29 * step, + base + 30 * step, + base + 31 * step, + base + 32 * step, + base + 33 * step, + base + 34 * step, + base + 35 * step, + base + 36 * step, + base + 37 * step, + base + 38 * step, + base + 39 * step, + base + 40 * step, + base + 41 * step, + base + 42 * step, + base + 43 * step, + base + 44 * step, + base + 45 * step, + base + 46 * step, + base + 47 * step, + base + 48 * step, + base + 49 * step, + base + 50 * step, + base + 51 * step, + base + 52 * step, + base + 53 * step, + base + 54 * step, + base + 55 * step, + base + 56 * step, + base + 57 * step, + base + 58 * step, + base + 59 * step, + base + 60 * step, + base + 61 * step, + base + 62 * step, + base + 63 * step); + } + static Vectorized set(Vectorized a, Vectorized b, T count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) switch (count) { case 0: return a; @@ -746,6 +1324,7 @@ class Vectorized8 : public Vectorizedi { return _mm512_loadu_si512(reinterpret_cast(ptr)); } static Vectorized loadu_one_fourth(const void* ptr) { +<<<<<<< HEAD // Fast path if only load element number of 16. // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), // Because loadu(const void* ptr, T count) requires zero initialization for upper 384 bits. @@ -754,6 +1333,17 @@ class Vectorized8 : public Vectorizedi { // since gcc 9.3 doesn't support it now. __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); return _mm512_castsi128_si512(input_128); +======= + // Fast path if only load element number of 16. + // Note: We didn't merge it as fast path of loadu(const void* ptr, T count), + // Because loadu(const void* ptr, T count) requires zero initialization for + // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384 + // bits of the result are undefined. + // TODO We can use _mm512_zextsi128_si512 in the furture, + // since gcc 9.3 doesn't support it now. 
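      // Once the minimum supported toolchains provide it, _mm512_zextsi128_si512
      // gives the zero-extended 128->512 cast directly. A masked byte load is
      // another way to get guaranteed-zero upper lanes (a sketch, assuming
      // AVX-512BW; not the fast path used below):
      //   __mmask64 lo16 = (1ULL << 16) - 1;                // keep bytes 0..15
      //   return _mm512_maskz_loadu_epi8(lo16, ptr);        // upper 48 bytes are zero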
+ __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); + return _mm512_castsi128_si512(input_128); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static Vectorized loadu(const void* ptr, T count) { if (count == size()) { @@ -775,16 +1365,25 @@ class Vectorized8 : public Vectorizedi { if (count == 16) { // Fast path if only store element number of 16 _mm_storeu_si128( +<<<<<<< HEAD reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values)); +======= + reinterpret_cast<__m128i*>(ptr), _mm512_castsi512_si128(values)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { __mmask64 mask = (1ULL << count) - 1; _mm512_mask_storeu_epi8(ptr, mask, values); } } } +<<<<<<< HEAD const T& operator[](int idx) const = delete; T& operator[](int idx) = delete; +======= + const T& operator[](int idx) const = delete; + T& operator[](int idx) = delete; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized real() const { return *this; } @@ -796,6 +1395,7 @@ class Vectorized8 : public Vectorizedi { } }; +<<<<<<< HEAD template<> class Vectorized: public Vectorized8 { public: @@ -803,6 +1403,20 @@ class Vectorized: public Vectorized8 { static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: + using Vectorized8::Vectorized8; + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi8(0xFF); auto mask_ = _mm512_cmp_epi8_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi8(mask_, a.values, b.values); @@ -845,6 +1459,7 @@ class Vectorized: public Vectorized8 { Vectorized le(const Vectorized& other) const; }; +<<<<<<< HEAD template<> class Vectorized: public Vectorized8 { public: @@ -852,6 +1467,20 @@ class Vectorized: public Vectorized8 { static Vectorized blendv(const Vectorized& a, const Vectorized& b, const Vectorized& mask) { +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +class Vectorized : public Vectorized8 { + public: + using Vectorized8::Vectorized8; + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto msb_one = _mm512_set1_epi8(0xFF); auto mask_ = _mm512_cmp_epu8_mask(mask, msb_one, _MM_CMPINT_EQ); return _mm512_mask_blend_epi8(mask_, a.values, b.values); @@ -895,52 +1524,112 @@ class Vectorized: public Vectorized8 { }; template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& 
b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_add_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sub_epi8(a, b); } @@ -966,22 +1655,47 @@ inline Vectorized Vectorized::neg() const { } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mullo_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mullo_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_mullo_epi16(a, b); } template +<<<<<<< HEAD Vectorized inline int_elementwise_binary_512(const Vectorized& a, const Vectorized& b, Op op) { +======= +Vectorized inline int_elementwise_binary_512( + const Vectorized& a, + const Vectorized& b, + Op op) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T values_a[Vectorized::size()]; T values_b[Vectorized::size()]; a.store(values_a); @@ -993,7 +1707,13 @@ Vectorized inline int_elementwise_binary_512(const Vectorized& a, const Ve } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We don't have an instruction for multiplying int8_t #ifndef CPU_CAPABILITY_AVX512 return int_elementwise_binary_512(a, b, std::multiplies()); @@ -1011,14 +1731,25 @@ Vectorized inline operator*(const Vectorized& a, const Vectorize } template <> +<<<<<<< HEAD Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We don't have an instruction for multiplying uint8_t #ifndef CPU_CAPABILITY_AVX512 return int_elementwise_binary_512(a, b, std::multiplies()); #else __m512i mask00FF = _mm512_set1_epi16(0x00FF); +<<<<<<< HEAD __m512i a_lo = _mm512_and_si512 (a, mask00FF); __m512i b_lo = _mm512_and_si512 (b, mask00FF); +======= + __m512i a_lo = _mm512_and_si512(a, mask00FF); + __m512i b_lo = _mm512_and_si512(b, mask00FF); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512i a_hi = _mm512_srli_epi16(a, 8); __m512i b_hi = _mm512_srli_epi16(b, 8); __m512i res_lo = _mm512_and_si512(_mm512_mullo_epi16(a_lo, b_lo), mask00FF); @@ -1029,126 +1760,276 @@ Vectorized inline operator*(const Vectorized& a, const Vectori } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epu8(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi8(a, b); } template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epu8(a, b); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi64(max_val, _mm512_max_epi64(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi32(max_val, _mm512_max_epi32(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline 
clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi16(max_val, _mm512_max_epi16(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi8(max_val, _mm512_max_epi8(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) { +======= +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_val, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epu8(max_val, _mm512_max_epu8(a, min_val)); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi64(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi32(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi16(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epi8(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) { +======= +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_min_epu8(max_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi64(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi32(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi16(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_max_epi8(min_val, a); } template <> +<<<<<<< HEAD Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) { return _mm512_max_epu8(min_val, a); } @@ -1164,17 +2045,52 @@ std::enable_if_t, Vectorized> inline convert_to_int32(const int8_t* ptr, int count=Vectorized::size()) { if (count == Vectorized::size()) { return _mm512_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(ptr))); +======= +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_val) { + return _mm512_max_epu8(min_val, a); +} + +template +std::enable_if_t< + !(std::is_same_v || std::is_same_v), + Vectorized< + int32_t>> inline convert_to_int32(const T* ptr, int count = Vectorized::size()) { + return Vectorized::loadu(ptr, count); +} + +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const int8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm512_cvtepi8_epi32( + _mm_loadu_si128(reinterpret_cast(ptr))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { auto a = Vectorized::loadu(ptr, count); return _mm512_cvtepi8_epi32(_mm512_castsi512_si128(a)); } } +<<<<<<< HEAD template std::enable_if_t, Vectorized> inline convert_to_int32(const uint8_t* ptr, int count=Vectorized::size()) { if (count == Vectorized::size()) { return _mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast(ptr))); +======= +template +std:: + enable_if_t, Vectorized> inline convert_to_int32( + const uint8_t* ptr, + int count = Vectorized::size()) { + if (count == Vectorized::size()) { + return _mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast(ptr))); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { auto a = Vectorized::loadu(ptr, count); return _mm512_cvtepu8_epi32(_mm512_castsi512_si128(a)); @@ -1182,6 +2098,7 @@ inline convert_to_int32(const uint8_t* ptr, int count=Vectorized::size( } template <> +<<<<<<< HEAD Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { return int_elementwise_binary_512(a, b, std::divides()); } @@ -1215,10 +2132,72 @@ inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { return _mm512_xor_si512(a, b); } template>::value, int> = 0> +======= +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& 
a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + return int_elementwise_binary_512(a, b, std::divides()); +} + +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return _mm512_and_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return _mm512_or_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + return _mm512_xor_si512(a, b); +} +template < + class T, + typename std::enable_if_t< + std::is_base_of>::value, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized operator~(const Vectorized& a) { return _mm512_xor_si512(a, _mm512_set1_epi32(-1)); } +<<<<<<< HEAD inline Vectorized Vectorized::eq(const Vectorized& other) const { return (*this == other) & Vectorized(1); } @@ -1341,6 +2320,167 @@ inline Vectorized Vectorized::le(const Vectorized& ot template || std::is_same_v, int> = 0> Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) { +======= +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized 
Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +template < + bool left_shift, + typename T, + typename std::enable_if_t< + std::is_same_v || std::is_same_v, + int> = 0> +Vectorized inline shift_512_8( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // No vector instruction for shifting int8_t/uint8_t, so emulating // it instead. @@ -1350,6 +2490,7 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) // M!=N) is set so that shuffle will move element with index M from // input pair into element with index N in output pair, and element // with index M in output pair will be set to all 0s. 
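  // The eq()/ne()/gt()/ge()/lt()/le() overloads above all follow the same
  // pattern: the full-width comparison produces lanes of all-ones, and AND-ing
  // with Vectorized<T>(1) turns that into the 0/1 "boolean" lanes callers
  // expect. With raw intrinsics the int16_t case looks roughly like this
  // (a sketch, assuming AVX-512BW):
  //   __mmask32 k  = _mm512_cmpeq_epi16_mask(a, b);                 // 1 bit per equal lane
  //   __m512i ones = _mm512_maskz_mov_epi16(k, _mm512_set1_epi16(-1)); // 0xFFFF where equal
  //   __m512i out  = _mm512_and_si512(ones, _mm512_set1_epi16(1));     // 0 or 1 per lane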
+<<<<<<< HEAD __m512i ctl_0_1 = _mm512_set_epi8(62, 0x80, 60, 0x80, 58, 0x80, 56, 0x80, 54, 0x80, 52, 0x80, 50, 0x80, 48, 0x80, 46, 0x80, 44, 0x80, 42, 0x80, 40, 0x80, @@ -1366,6 +2507,138 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) 0x80, 23, 0x80, 21, 0x80, 19, 0x80, 17, 0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9, 0x80, 7, 0x80, 5, 0x80, 3, 0x80, 1); +======= + __m512i ctl_0_1 = _mm512_set_epi8( + 62, + 0x80, + 60, + 0x80, + 58, + 0x80, + 56, + 0x80, + 54, + 0x80, + 52, + 0x80, + 50, + 0x80, + 48, + 0x80, + 46, + 0x80, + 44, + 0x80, + 42, + 0x80, + 40, + 0x80, + 38, + 0x80, + 36, + 0x80, + 34, + 0x80, + 32, + 0x80, + 30, + 0x80, + 28, + 0x80, + 26, + 0x80, + 24, + 0x80, + 22, + 0x80, + 20, + 0x80, + 18, + 0x80, + 16, + 0x80, + 14, + 0x80, + 12, + 0x80, + 10, + 0x80, + 8, + 0x80, + 6, + 0x80, + 4, + 0x80, + 2, + 0x80, + 0, + 0x80); + __m512i ctl_1_0 = _mm512_set_epi8( + 0x80, + 63, + 0x80, + 61, + 0x80, + 59, + 0x80, + 57, + 0x80, + 55, + 0x80, + 53, + 0x80, + 51, + 0x80, + 49, + 0x80, + 47, + 0x80, + 45, + 0x80, + 43, + 0x80, + 41, + 0x80, + 39, + 0x80, + 37, + 0x80, + 35, + 0x80, + 33, + 0x80, + 31, + 0x80, + 29, + 0x80, + 27, + 0x80, + 25, + 0x80, + 23, + 0x80, + 21, + 0x80, + 19, + 0x80, + 17, + 0x80, + 15, + 0x80, + 13, + 0x80, + 11, + 0x80, + 9, + 0x80, + 7, + 0x80, + 5, + 0x80, + 3, + 0x80, + 1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Masks for bitwise and operation, treating 512 bits as an array of // 8-bit elements, and considering them in pairs of neighboring @@ -1396,11 +2669,18 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) __m512i c0; if (left_shift) c0 = _mm512_sllv_epi16(a0, b0); +<<<<<<< HEAD else if constexpr (std::is_same_v) c0 = _mm512_srav_epi16(a0, b0); else c0 = _mm512_srlv_epi16(a0, b0); +======= + else if constexpr (std::is_same_v) + c0 = _mm512_srav_epi16(a0, b0); + else + c0 = _mm512_srlv_epi16(a0, b0); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c0 = _mm512_shuffle_epi8(c0, ctl_1_0); // Peform shifting the same way for input array elements with @@ -1410,11 +2690,18 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) __m512i c1; if (left_shift) c1 = _mm512_sllv_epi16(a1, b1); +<<<<<<< HEAD else if constexpr (std::is_same_v) c1 = _mm512_srav_epi16(a1, b1); else c1 = _mm512_srlv_epi16(a1, b1); +======= + else if constexpr (std::is_same_v) + c1 = _mm512_srav_epi16(a1, b1); + else + c1 = _mm512_srlv_epi16(a1, b1); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c1 = _mm512_and_si512(c1, keep_1); // Merge partial results into the final result. 
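// The 8-bit shift emulation above builds on the 16-bit variable-shift
// instructions that do exist; a minimal self-contained sketch of that building
// block (assumes AVX-512BW and <immintrin.h>; illustrative, not part of this
// header):
static inline __m512i sllv_epi16_demo() {
  __m512i ones = _mm512_set1_epi16(1);
  __m512i cnts = _mm512_set_epi16(
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7);
  // Each 16-bit lane of `ones` is shifted left by the corresponding lane of
  // `cnts`, yielding the values 1, 2, 4, ..., 128 across the lanes.
  return _mm512_sllv_epi16(ones, cnts);
}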
@@ -1424,55 +2711,120 @@ Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sllv_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sllv_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_sllv_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_512_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_512_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_srav_epi64(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_srav_epi32(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_srav_epi16(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_512_8(a, b); } template <> +<<<<<<< HEAD Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return shift_512_8(a, b); } 
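// Note the asymmetry encoded in the operator>> overloads above: the signed
// element types lower to the arithmetic shifts (_mm512_srav_*), while uint8_t
// takes the logical path (_mm512_srlv_epi16 inside shift_512_8). A small
// illustration of the difference on 32-bit lanes (a sketch, assuming AVX-512F
// and <immintrin.h>):
static inline void shift_right_sign_demo() {
  __m512i v = _mm512_set1_epi32(-16);
  __m512i s = _mm512_set1_epi32(2);
  __m512i arith = _mm512_srav_epi32(v, s);    // every lane: -4 (sign preserved)
  __m512i logical = _mm512_srlv_epi32(v, s);  // every lane: 0x3FFFFFFC
  (void)arith;
  (void)logical;
}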
#endif +<<<<<<< HEAD }} +======= +} // namespace CPU_CAPABILITY +} // namespace at::vec +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_mask.h b/aten/src/ATen/cpu/vec/vec512/vec512_mask.h index d32e1da1cf72..a40cdcd3b8eb 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_mask.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_mask.h @@ -85,8 +85,12 @@ struct VecMaskLoad< mask_t, dst_n, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const data_t* ptr, const VecMask& vec_mask) { @@ -152,8 +156,12 @@ struct VecMaskLoad< mask_t, 1, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const data_t* ptr, const VecMask& vec_mask) { @@ -174,8 +182,12 @@ struct VecMaskLoad< mask_t, 1, std::enable_if_t< +<<<<<<< HEAD std::is_same_v || std::is_same_v>> { +======= + std::is_same_v || std::is_same_v>> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline VectorizedN apply( const data_t* ptr, const VecMask& vec_mask) { diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index ec14ef51601b..264145b610cd 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -35,8 +35,13 @@ // specified by float_vec_return_type. // // When writing kernels with these vectors, it is expected that floating- +<<<<<<< HEAD // point operations will be carried out in a loop over Vectorized::float_num_vecs // iterations. +======= +// point operations will be carried out in a loop over +// Vectorized::float_num_vecs iterations. 
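// A schematic of that usage pattern, written against the interface declared
// below (Vectorized<c10::qint8> exposing loadu/dequantize/quantize and
// float_num_vecs(), as in this header); a sketch only, with scale/zero-point
// setup elided:
//   using QVec = at::vec::Vectorized<c10::qint8>;
//   auto qx = QVec::loadu(src);
//   auto fx = qx.dequantize(scale_vec, zero_point_vec);   // float_num_vecs() float vectors
//   for (int i = 0; i < QVec::float_num_vecs(); ++i) {
//     fx[i] = fx[i] * fx[i];                              // any float-side computation
//   }
//   QVec::quantize(fx, scale, zero_point, inverse_scale).store(dst);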
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { namespace vec { @@ -62,7 +67,10 @@ struct Vectorizedqi { } }; +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template __m512i pack_saturate_and_clamp( __m512i first, @@ -106,10 +114,19 @@ inline __m512i pack_saturate_and_clamp( } template +<<<<<<< HEAD typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> inline convert_int8_to_float(at::vec::Vectorized src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() // Only handle first 16*8 bits +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + float>> inline convert_int8_to_float(at::vec::Vectorized src) { + // Note: this function only convert inputs number of elements equal to + // at::vec::Vectorized.size() Only handle first 16*8 bits +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i input_128 = _mm512_castsi512_si128(src); // Convert from 16*uint8/int8 to 16*int32 __m512i input_512_extended; @@ -122,8 +139,15 @@ inline convert_int8_to_float(at::vec::Vectorized src) { } template +<<<<<<< HEAD typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> inline convert_float_to_int8(at::vec::Vectorized src) { +======= +typename std::enable_if_t< + std::is_same_v || std::is_same_v, + at::vec::Vectorized< + T>> inline convert_float_to_int8(at::vec::Vectorized src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Convert from float32 to int32 with truncation __m512i x_values_int32 = _mm512_cvttps_epi32(src); @@ -134,11 +158,33 @@ inline convert_float_to_int8(at::vec::Vectorized src) { constexpr auto max_val = std::numeric_limits::max(); // Convert from int16 to uint8/int8 using unsigned saturation +<<<<<<< HEAD __m512i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); __m512i permute_mask_v = _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00); +======= + __m512i xyzw_clamped_v = + pack_saturate_and_clamp(xy_packed_v, xy_packed_v, min_val, max_val); + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); } @@ -178,12 +224,49 @@ __FORCE_INLINE void QuantizeAvx512( 0xff, 0xff, 0xff, 0xff, 0x0c, 0x08, 0x04, 0x00); // clang-format on +<<<<<<< HEAD __m512i permute_mask_v = _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00); __m512i permute_mask_l8_v = _mm512_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x08, 0x04, 0x00); +======= + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); + __m512i permute_mask_l8_v = _mm512_set_epi32( + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, 
+ 0x00, + 0x0c, + 0x08, + 0x04, + 0x00); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int len_aligned = len / (VLEN * 4) * (VLEN * 4); for (; i < len_aligned; i += 4 * VLEN) { // x @@ -226,8 +309,12 @@ __FORCE_INLINE void QuantizeAvx512( __m512i xyzw_clamped_v = pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val); +<<<<<<< HEAD xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); +======= + xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + i), xyzw_clamped_v); } @@ -269,6 +356,7 @@ __FORCE_INLINE void QuantizeAvx512( } } +<<<<<<< HEAD template<> struct Vectorized : public Vectorizedqi { using size_type = int; @@ -395,6 +483,145 @@ struct Vectorized : public Vectorizedqi { template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + using size_type = int; + static constexpr size_type size() { + return 16; + } + + static constexpr int float_num_vecs() { + return 1; + } + + static constexpr int int_num_vecs() { + return 1; + } + + using float_vec_return_type = std::array, 1>; + using int_vec_return_type = std::array, 1>; + using value_type = c10::qint32::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint32& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi32(uw); + } + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
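      // The loop-plus-memcpy below keeps the tail lanes well defined; the same
      // partial load can also be written with a zero-masking load, which
      // zero-fills the unselected lanes directly (a sketch, assuming AVX-512F;
      // not the path used here):
      //   __mmask16 k = (__mmask16)((1u << count) - 1);
      //   return Vectorized<c10::qint32>(_mm512_maskz_loadu_epi32(k, ptr));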
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { + __m512 float_vals = _mm512_cvtepi32_ps(vals); + return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)}; + } + + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { + __m512 float_vals = _mm512_cvtepi32_ps(vals); + return {(Vectorized(float_vals) - zero_point) * scale}; + } + + static Vectorized quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale [[maybe_unused]]) { + Vectorized retval; + auto rhs_data = (__m512)rhs[0]; + at::native::quantize_vec( + scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 16); + return retval; + } + + Vectorized maximum(Vectorized b) const { + return _mm512_max_epi32(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epi32(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epi32( + _mm512_max_epi32(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { + return {_mm512_sub_epi32(vals, b)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + + __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier_v); + __m512i rounded = _mm512_cvtps_epi32(scaled); + return _mm512_add_epi32(rounded, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm512_loadu_si512((const __m512i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -425,9 +652,29 @@ __m512i RequantizeAvx512( "Only int8_t/uint8_t are supported"); constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); +<<<<<<< HEAD __m512i permute_mask_v = _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, 0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00); +======= + __m512i permute_mask_v = _mm512_set_epi32( + 0x0f, + 0x0b, + 0x07, + 0x03, + 0x0e, + 0x0a, + 0x06, + 0x02, + 0x0d, + 0x09, + 0x05, + 0x01, + 0x0c, + 0x08, + 0x04, + 0x00); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 x_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier); __m512 y_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[1]), multiplier); __m512 z_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[2]), multiplier); @@ -453,12 +700,18 @@ __m512i RequantizeAvx512( /* * xyzw_clamped_v has results in the following layout so we need to +<<<<<<< HEAD * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11 x12-15 y12-15 z12-15 w12-15 +======= + * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11 + * x12-15 y12-15 z12-15 w12-15 +>>>>>>> 5729657180 ([ROCm] Specialized binary 
elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) */ xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v); return xyzw_clamped_v; } +<<<<<<< HEAD template<> struct Vectorized : public Vectorizedqi { static constexpr int size() { @@ -526,23 +779,113 @@ struct Vectorized : public Vectorizedqi { __m512i cvtepi8_epi32(__m128i epi8_vals) const { return _mm512_cvtepi8_epi32(epi8_vals); } +======= +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int size() { + return 64; + } + + static constexpr int float_num_vecs() { + return 4; + } + + static constexpr int int_num_vecs() { + return 4; + } + + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = typename c10::qint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + + Vectorized() {} + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::qint8& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi8(uw); + } + + // This is needed because the compiler emits awful code for the default + // constructor for moving the enum + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + + // This is added to avoid error: definition of implicit copy assignment + // operator for 'Vectorized' is deprecated because it has a + // user-declared copy constructor [-Werror,-Wdeprecated-copy] + Vectorized& operator=(const Vectorized&) = default; + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
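      // (For the requantization path of this class: per lane, RequantizeAvx512
      // above computes q = saturate_to_int8(round_to_nearest_even(d * multiplier)
      // + zero_point), the rounding coming from _mm512_cvtps_epi32's default
      // rounding mode. A scalar sketch of the same arithmetic, illustrative
      // only and assuming <cmath> and <algorithm>:
      //   long r = lrintf((float)d * multiplier) + zero_point;
      //   int8_t q = (int8_t)std::clamp<long>(r, -128, 127);)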
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + private: + __m512i cvtepi8_epi32(__m128i epi8_vals) const { + return _mm512_cvtepi8_epi32(epi8_vals); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point, Vectorized scale_neg_zp_premul) const { +<<<<<<< HEAD #if defined(_MSC_VER) && !defined(__clang__) +======= +#if defined(_MSC_VER) && !defined(__clang__) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +<<<<<<< HEAD #else +======= +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +<<<<<<< HEAD #endif +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); @@ -563,17 +906,29 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { +<<<<<<< HEAD #if defined(_MSC_VER) && !defined(__clang__) +======= +#if defined(_MSC_VER) && !defined(__clang__) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +<<<<<<< HEAD #else +======= +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +<<<<<<< HEAD #endif +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); @@ -600,6 +955,7 @@ struct Vectorized : public Vectorizedqi { } Vectorized maximum(Vectorized b) const { +<<<<<<< HEAD return _mm512_max_epi8(vals, b.vals); } @@ -750,23 +1106,198 @@ struct Vectorized : public Vectorizedqi { __m512i cvtepu8_epi32(__m128i epu8_vals) const { return _mm512_cvtepu8_epi32(epu8_vals); } +======= + return _mm512_max_epi8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epi8(vals, b.vals); 
+ } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epi8(_mm512_max_epi8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512i int32_val0 = cvtepi8_epi32(int_val0); + __m512i int32_val1 = cvtepi8_epi32(int_val1); + __m512i int32_val2 = cvtepi8_epi32(int_val2); + __m512i int32_val3 = cvtepi8_epi32(int_val3); + +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +#else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +#endif + + __m512i int32_b0 = cvtepi8_epi32(int_b0); + __m512i int32_b1 = cvtepi8_epi32(int_b1); + __m512i int32_b2 = cvtepi8_epi32(int_b2); + __m512i int32_b3 = cvtepi8_epi32(int_b3); + + __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0); + __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1); + __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2); + __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3); + + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + return RequantizeAvx512(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm512_loadu_si512((const __m512i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return a.maximum(b); +} + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public Vectorizedqi { + static constexpr int size() { + return 64; + } + + static constexpr int float_num_vecs() { + return 4; + } + + static constexpr int int_num_vecs() { + return 4; + } + + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = typename c10::quint8::underlying; + + public: + using Vectorizedqi::Vectorizedqi; + Vectorized() {} + + Vectorized(__m512i vals_) { + vals = vals_; + } + + // Broadcast constructor + Vectorized(const c10::quint8& val) { + value_type uw = val.val_; + vals = _mm512_set1_epi8(uw); + } + + Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) {} + + // This is added to avoid error: definition 
of implicit copy assignment + // operator for 'Vectorized' is deprecated because it has a + // user-declared copy constructor [-Werror,-Wdeprecated-copy] + Vectorized& operator=(const Vectorized&) = default; + + void store(void* ptr, int count = size()) const { + if (count != size()) { + memcpy(ptr, &vals, count * sizeof(value_type)); + } else { + _mm512_storeu_si512((__m512i*)ptr, vals); + } + } + + static Vectorized loadu(const void* ptr) { + return Vectorized(ptr); + } + + static Vectorized loadu(const void* ptr, int64_t count) { + __at_align__ value_type tmp_values[size()]; + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); + return loadu(tmp_values); + } + + private: + __m512i cvtepu8_epi32(__m128i epu8_vals) const { + return _mm512_cvtepu8_epi32(epu8_vals); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point, Vectorized scale_zp_premul) const { +<<<<<<< HEAD #if defined(_MSC_VER) && !defined(__clang__) +======= +#if defined(_MSC_VER) && !defined(__clang__) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +<<<<<<< HEAD #else +======= +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +<<<<<<< HEAD #endif +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); @@ -788,17 +1319,29 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { +<<<<<<< HEAD #if defined(_MSC_VER) && !defined(__clang__) +======= +#if defined(_MSC_VER) && !defined(__clang__) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +<<<<<<< HEAD #else +======= +#else +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = 
_mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +<<<<<<< HEAD #endif +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); @@ -826,6 +1369,7 @@ struct Vectorized : public Vectorizedqi { } Vectorized maximum(Vectorized b) const { +<<<<<<< HEAD return _mm512_max_epu8(vals, b.vals); } @@ -908,6 +1452,91 @@ struct Vectorized : public Vectorizedqi { template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= + return _mm512_max_epu8(vals, b.vals); + } + + Vectorized minimum(Vectorized b) const { + return _mm512_min_epu8(vals, b.vals); + } + + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + + Vectorized relu6( + Vectorized zero_point, + Vectorized q_six) { + return _mm512_min_epu8(_mm512_max_epu8(vals, zero_point.vals), q_six.vals); + } + + int_vec_return_type widening_subtract(Vectorized b) const { +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +#else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +#endif + + __m512i int32_val0 = cvtepu8_epi32(int_val0); + __m512i int32_val1 = cvtepu8_epi32(int_val1); + __m512i int32_val2 = cvtepu8_epi32(int_val2); + __m512i int32_val3 = cvtepu8_epi32(int_val3); + +#if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +#else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +#endif + + __m512i int32_b0 = cvtepu8_epi32(int_b0); + __m512i int32_b1 = cvtepu8_epi32(int_b1); + __m512i int32_b2 = cvtepu8_epi32(int_b2); + __m512i int32_b3 = cvtepu8_epi32(int_b3); + + __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0); + __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1); + __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2); + __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3); + return { + Vectorized(res_0), + Vectorized(res_1), + Vectorized(res_2), + Vectorized(res_3)}; + } + + static Vectorized requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + __m512 multiplier_v = _mm512_set1_ps(multiplier); + __m512i zero_point_v = _mm512_set1_epi32(zero_point); + return RequantizeAvx512(inp, multiplier_v, zero_point_v); + } + + private: + // Load from memory constructor + Vectorized(const void* ptr) { + vals = _mm512_loadu_si512((const __m512i*)ptr); + } +}; + +template <> +Vectorized inline maximum( + const 
Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -964,7 +1593,12 @@ struct VectorizedQuantizedConverter { tmp_vals[j] = at::native::dequantize_val( scale[j], zero_point[j], T(vals[16 * i + j])); } +<<<<<<< HEAD rv[i] = Vectorized(tmp_vals[0], +======= + rv[i] = Vectorized( + tmp_vals[0], +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tmp_vals[1], tmp_vals[2], tmp_vals[3], @@ -996,11 +1630,22 @@ struct VectorizedQuantizedConverter { }; template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint32, std::array, 1>, std::array, 1>, 16> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint32, + std::array, 1>, + std::array, 1>, + 16> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::qint32, @@ -1026,6 +1671,7 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. @@ -1033,6 +1679,19 @@ struct Vectorized : public VectorizedQuantizedConverter< tmp_values[i] = 0; } std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
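Editor's note: the `dequantize` members above, both the AVX-512 paths that split the 512-bit register into four 128-bit groups, widen each 8-bit lane to int32 and convert to float, and the scalar fallback in `VectorizedQuantizedConverter` that calls `at::native::dequantize_val` per lane, realize the usual affine mapping real = scale * (q - zero_point). The sketch below restates that mapping in scalar form; the helper names and constants are illustrative, and the remark about `scale_neg_zp_premul` matching a fused multiply-add is an inference from the parameter name, not something shown in this hunk.

```cpp
#include <cstdint>
#include <cstdio>

// Usual affine dequantization convention: real = scale * (q - zero_point).
float dequantize(int8_t q, float scale, float zero_point) {
  return scale * (static_cast<float>(q) - zero_point);
}

// Algebraically identical form shaped like a fused multiply-add:
// real = q * scale + (-zero_point * scale), with the second term
// precomputed once (cf. the scale_neg_zp_premul parameter above).
float dequantize_premul(int8_t q, float scale, float scale_neg_zp_premul) {
  return static_cast<float>(q) * scale + scale_neg_zp_premul;
}

int main() {
  const float scale = 0.05f, zero_point = 4.0f;
  const float premul = -zero_point * scale;
  for (int q : {-8, 0, 4, 100}) {
    std::printf("q=%4d  direct=%f  premul=%f\n", q,
                dequantize(static_cast<int8_t>(q), scale, zero_point),
                dequantize_premul(static_cast<int8_t>(q), scale, premul));
  }
}
```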
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return loadu(tmp_values); } @@ -1074,11 +1733,18 @@ struct Vectorized : public VectorizedQuantizedConverter< return retval; } +<<<<<<< HEAD Vectorized relu(Vectorized zero_point) const { return maximum(zero_point); } +======= + Vectorized relu(Vectorized zero_point) const { + return maximum(zero_point); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -1113,7 +1779,13 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } @@ -1140,11 +1812,22 @@ Vectorized inline operator+( } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::qint8, std::array, 4>, std::array, 4>, 64> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::qint8, + std::array, 4>, + std::array, 4>, + 64> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::qint8, @@ -1170,6 +1853,7 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. @@ -1177,6 +1861,19 @@ struct Vectorized : public VectorizedQuantizedConverter< tmp_values[i] = 0; } std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
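Editor's note: the comment just above (repeated in every `loadu(ptr, count)` overload in this file) describes the partial-load pattern: zero a stack buffer with a loop, copy only `count` valid elements, then perform the full-width load from the buffer. Below is a standalone sketch of that pattern with plain arrays, assuming nothing beyond the standard library (the AVX-512 load itself is stubbed out with a second memcpy, and the helper name is illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

constexpr int kLanes = 64;  // e.g. 64 int8 lanes in a 512-bit register

// Stand-in for Vectorized<T>::loadu(ptr, count): only `count` elements at
// `ptr` are valid, but the register-width load must not observe garbage.
void partial_load(const void* ptr, int64_t count, int8_t out[kLanes]) {
  int8_t tmp[kLanes];
  // Zero with a loop rather than "= {0}"; per the comment above, gcc
  // compiles the loop to a single instruction for this buffer.
  for (int i = 0; i < kLanes; ++i) {
    tmp[i] = 0;
  }
  // Copy only the valid prefix; the zeroed tail guarantees uninitialized
  // memory never leaks into the vector (pytorch/pytorch#32502).
  std::memcpy(tmp, ptr, count * sizeof(int8_t));
  // A real implementation would issue a full-width loadu from tmp here.
  std::memcpy(out, tmp, sizeof(tmp));
}

int main() {
  int8_t src[3] = {1, 2, 3};
  int8_t lanes[kLanes];
  partial_load(src, 3, lanes);
  std::printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);  // 1 2 3 0
}
```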
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return loadu(tmp_values); } @@ -1267,16 +1964,33 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } template <> +<<<<<<< HEAD struct Vectorized : public VectorizedQuantizedConverter< c10::quint8, std::array, 4>, std::array, 4>, 64> { +======= +struct is_vec_specialized_for : std::bool_constant {}; + +template <> +struct Vectorized : public VectorizedQuantizedConverter< + c10::quint8, + std::array, 4>, + std::array, 4>, + 64> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized() : VectorizedQuantizedConverter< c10::quint8, @@ -1302,6 +2016,7 @@ struct Vectorized : public VectorizedQuantizedConverter< static Vectorized loadu(const void* ptr, int64_t count) { __at_align__ value_type tmp_values[size()]; +<<<<<<< HEAD // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. @@ -1309,6 +2024,19 @@ struct Vectorized : public VectorizedQuantizedConverter< tmp_values[i] = 0; } std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type)); +======= + // Ensure uninitialized memory does not change the output value See + // https://github.com/pytorch/pytorch/issues/32502 for more details. We do + // not initialize arrays to zero using "={0}" because gcc would compile it + // to two instructions while a loop would be compiled to one instruction. 
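Editor's note: the `relu` and `relu6` members that recur throughout these quantized classes never dequantize. `relu` is simply `maximum(zero_point)` and `relu6` clamps between `zero_point` and `q_six`, which works because the zero point encodes real 0.0 and `q_six` encodes real 6.0 under the affine scheme. A scalar sketch of that identity follows; the scale, zero point and `quantize` helper are illustrative values, not taken from this file.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// real = scale * (q - zero_point); zero_point is the integer encoding 0.0.
int32_t quantize(float real, float scale, int32_t zero_point) {
  return zero_point + static_cast<int32_t>(std::lround(real / scale));
}

int main() {
  const float   scale = 0.1f;
  const int32_t zp    = 10;                         // encodes 0.0
  const int32_t q6    = quantize(6.0f, scale, zp);  // encodes 6.0

  for (int q : {2, 10, 37, 65, 90}) {
    int relu_q  = std::max(q, static_cast<int>(zp));       // relu(zero_point)
    int relu6_q = std::min(relu_q, static_cast<int>(q6));  // relu6(zero_point, q_six)
    std::printf("q=%2d real=%5.2f relu=%5.2f relu6=%5.2f\n", q,
                scale * (q - zp), scale * (relu_q - zp), scale * (relu6_q - zp));
  }
}
```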
+ for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(value_type)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return loadu(tmp_values); } @@ -1354,7 +2082,10 @@ struct Vectorized : public VectorizedQuantizedConverter< return maximum(zero_point); } +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized relu6( Vectorized zero_point, Vectorized q_six) { @@ -1400,10 +2131,22 @@ struct Vectorized : public VectorizedQuantizedConverter< }; template <> +<<<<<<< HEAD Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +======= +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return a.maximum(b); } #endif // defined(CPU_CAPABILITY_AVX512) && !defined(MSVC) +<<<<<<< HEAD }}} +======= +} // namespace CPU_CAPABILITY +} // namespace vec +} // namespace at +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index 2591338881ae..6f9b24d08089 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -1,5 +1,10 @@ #pragma once +<<<<<<< HEAD #if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ <= 2 && defined(__ARM_FEATURE_SVE) +======= +#if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ <= 2 && \ + defined(__ARM_FEATURE_SVE) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Workaround for https: //gcc.gnu.org/bugzilla/show_bug.cgi?id=117161 #pragma GCC optimize("no-tree-vectorize") #endif @@ -18,6 +23,7 @@ // See https://github.com/pytorch/pytorch/issues/37577 for an instance // of this bug in the past. +<<<<<<< HEAD #include #include #include @@ -39,6 +45,29 @@ #include #include #include +======= +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(__GNUC__) #define __FORCE_INLINE __attribute__((always_inline)) inline @@ -66,7 +95,12 @@ Windows llvm will not have this definition. #endif #define VECTOR_WIDTH 64 #define int_vector __m512i +<<<<<<< HEAD #elif defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512 +======= +#elif defined(__aarch64__) && \ + !defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // SVE code expects 256-vectors; leave that set for SVE? 
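Editor's note: the `vec_base.h` hunk above fixes `VECTOR_WIDTH` at 64 bytes for AVX-512 (with `int_vector` being `__m512i`), and the generic `Vectorized<T>` defined further down stores `VECTOR_WIDTH / sizeof(T)` scalars. That ratio is where the `size() == 64` of the qint8/quint8 specializations and the `size() == 16` of qint32 come from. A compile-time sketch of those lane counts; the 64-byte constant is restated here rather than derived.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t kVectorWidth = 64;  // bytes per AVX-512 register

template <typename T>
constexpr std::size_t lanes() { return kVectorWidth / sizeof(T); }

static_assert(lanes<float>()   == 16, "16 float lanes per 512-bit register");
static_assert(lanes<double>()  ==  8, "8 double lanes");
static_assert(lanes<int8_t>()  == 64, "64 lanes, matching size() == 64 for qint8/quint8");
static_assert(lanes<int32_t>() == 16, "16 lanes, matching size() == 16 for qint32");

int main() {
  std::printf("float:%zu double:%zu int8:%zu int32:%zu\n",
              lanes<float>(), lanes<double>(), lanes<int8_t>(), lanes<int32_t>());
}
```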
#if defined(__GNUC__) #define __at_align__ __attribute__((aligned(16))) @@ -93,6 +127,7 @@ namespace at::vec { inline namespace CPU_CAPABILITY { // at::Half and at::BFloat16 should be treated as floating point template +<<<<<<< HEAD struct is_floating_point: std::integral_constant || @@ -118,15 +153,52 @@ struct is_8bit_integer: std::integral_constant || std::is_same_v> { +======= +struct is_floating_point + : std::integral_constant< + bool, + std::is_floating_point_v || std::is_same_v || + std::is_same_v> {}; + +template +constexpr bool is_floating_point_v = is_floating_point::value; + +template +struct is_reduced_floating_point + : std::integral_constant< + bool, + std::is_same_v || std::is_same_v> {}; + +template +constexpr bool is_reduced_floating_point_v = + is_reduced_floating_point::value; + +template +struct is_8bit_integer + : std::integral_constant< + bool, + std::is_same_v || std::is_same_v> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; template constexpr bool is_8bit_integer_v = is_8bit_integer::value; +<<<<<<< HEAD template struct int_of_size; #define DEFINE_INT_OF_SIZE(int_t) \ template<> struct int_of_size { using type = int_t; } +======= +template +struct int_of_size; + +#define DEFINE_INT_OF_SIZE(int_t) \ + template <> \ + struct int_of_size { \ + using type = int_t; \ + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DEFINE_INT_OF_SIZE(int64_t); DEFINE_INT_OF_SIZE(int32_t); @@ -138,18 +210,55 @@ DEFINE_INT_OF_SIZE(int8_t); template using int_same_size_t = typename int_of_size::type; +<<<<<<< HEAD // NOTE: If you specialize on a type, you must define all operations! // emulates Vectorized types #if defined(__s390x__) template +======= +/** + * Detect at compile time whether Vectorized has an explicit + * specialization for T. (You are required to specialize this type + * whenever you specialize Vectorized). Useful for generic algorithms + * to decide whether to rely on a specialization being fast. For + * example, they might choose to handle reduced-precision floating + * point types directly if they're supported, or convert through float + * if not. + */ +#if defined(__s390x__) +template +#else +template +#endif +struct is_vec_specialized_for : std::bool_constant { +}; + +template +constexpr bool is_vec_specialized_for_v = is_vec_specialized_for::value; + +// NOTE: If you specialize Vectorized on a type, you must define all +// operations! You must also specialize is_vec_specialized_for for +// that type. + +// emulates Vectorized types +#if defined(__s390x__) +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #else template #endif struct Vectorized { +<<<<<<< HEAD private: __at_align__ T values[VECTOR_WIDTH / sizeof(T)]; public: +======= + private: + __at_align__ T values[VECTOR_WIDTH / sizeof(T)]; + + public: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) using value_type = T; using size_type = int; @@ -163,11 +272,19 @@ struct Vectorized { values[i] = val; } } +<<<<<<< HEAD template> Vectorized(Args... vals) : values{vals...}{ } Vectorized(const T(&arr)[kSize]) { +======= + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... 
vals) : values{vals...} {} + Vectorized(const T (&arr)[kSize]) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::memcpy(values, arr, sizeof(values)); } // This also implies const T& operator[](int idx) const @@ -198,20 +315,39 @@ struct Vectorized { } // Workaround for https: //gcc.gnu.org/bugzilla/show_bug.cgi?id=117001 #if __GNUC__ <= 12 && !defined(__clang__) && defined(__ARM_FEATURE_SVE) +<<<<<<< HEAD static Vectorized __attribute__ ((optimize("-fno-tree-loop-vectorize"))) blendv(const Vectorized& a, #else static Vectorized blendv(const Vectorized& a, #endif const Vectorized& b, const Vectorized& mask) { +======= + static Vectorized __attribute__((optimize("-fno-tree-loop-vectorize"))) + blendv( + const Vectorized& a, +#else + static Vectorized blendv( + const Vectorized& a, +#endif + const Vectorized& b, + const Vectorized& mask) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized vector; int_same_size_t buffer[size()]; mask.store(buffer); #if defined(__clang__) && __ARM_FEATURE_SVE +<<<<<<< HEAD #pragma clang loop vectorize(disable) #endif for (const auto i : c10::irange(size())) { if (buffer[i] & 0x01) { +======= +#pragma clang loop vectorize(disable) +#endif + for (const auto i : c10::irange(size())) { + if (buffer[i] & 0x01) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vector[i] = b[i]; } else { vector[i] = a[i]; @@ -219,15 +355,30 @@ struct Vectorized { } return vector; } +<<<<<<< HEAD template // step sometimes requires a higher precision type (e.g., T=int, step_t=double) static Vectorized arange(T base = static_cast(0), step_t step = static_cast(1)) { +======= + template // step sometimes requires a higher precision type + // (e.g., T=int, step_t=double) + static Vectorized arange( + T base = static_cast(0), + step_t step = static_cast(1)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized vector; for (const auto i : c10::irange(size())) { vector.values[i] = base + i * step; } return vector; } +<<<<<<< HEAD static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { +======= + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized vector; for (const auto i : c10::irange(size())) { if (i < count) { @@ -249,7 +400,13 @@ struct Vectorized { return vector; } static Vectorized loadu_one_fourth(const void* ptr) { +<<<<<<< HEAD static_assert(std::is_same_v || std::is_same_v, "For byte types only"); +======= + static_assert( + std::is_same_v || std::is_same_v, + "For byte types only"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized::loadu(ptr, 8); } @@ -257,9 +414,16 @@ struct Vectorized { std::memcpy(ptr, values, count * sizeof(T)); } int zero_mask() const { +<<<<<<< HEAD // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit int mask = 0; for (int i = 0; i < size(); ++ i) { +======= + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated 
to 0-bit + int mask = 0; + for (int i = 0; i < size(); ++i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (values[i] == static_cast(0)) { mask |= (1 << i); } @@ -279,15 +443,28 @@ struct Vectorized { } bool has_inf_nan() const { for (int64_t i = 0; i != size(); i++) { +<<<<<<< HEAD if(_isnan(values[i]) || _isinf(values[i])) { +======= + if (_isnan(values[i]) || _isinf(values[i])) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } } return false; } +<<<<<<< HEAD // MSVC versions between 14.36 and 14.42 has a loop unrolling bug on Windows Arm64 // See https://developercommunity.visualstudio.com/t/MSVC-loop-unrolling-problem-194033813-/10720692 #if defined(_WIN32) && defined(__aarch64__) && ((_MSVC_VER >= 1936) && (_MSVC_VER <= 1942)) +======= +// MSVC versions between 14.36 and 14.42 has a loop unrolling bug on Windows +// Arm64 +// See +// https://developercommunity.visualstudio.com/t/MSVC-loop-unrolling-problem-194033813-/10720692 +#if defined(_WIN32) && defined(__aarch64__) && \ + ((_MSVC_VER >= 1936) && (_MSVC_VER <= 1942)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized map(T (*const f)(T)) const { Vectorized ret; for (int64_t i = 0; i < size(); i++) { @@ -322,27 +499,45 @@ struct Vectorized { return ret; } #endif +<<<<<<< HEAD Vectorized map(T (*const f)(const T &)) const { +======= + Vectorized map(T (*const f)(const T&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (int64_t i = 0; i != size(); i++) { ret[i] = f(values[i]); } return ret; } +<<<<<<< HEAD T reduce(T (*const f)(const T &)) const { +======= + T reduce(T (*const f)(const T&)) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T ret = 0; for (int64_t i = 0; i != size(); i++) { ret = f(ret, values[i]); } return ret; } +<<<<<<< HEAD template && !c10::is_complex::value, int> = 0> +======= + template < + typename other_t_abs = T, + typename std::enable_if_t< + !is_floating_point_v && + !c10::is_complex::value, + int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { // other_t_abs is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "other_t_abs must be T"); return map([](T x) -> T { return x < static_cast(0) ? -x : x; }); } +<<<<<<< HEAD template , int> = 0> Vectorized abs() const { @@ -354,6 +549,21 @@ struct Vectorized { } template ::value, int> = 0> +======= + template < + typename float_t_abs = T, + typename std::enable_if_t, int> = 0> + Vectorized abs() const { + // float_t_abs is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "float_t_abs must be T"); + // Specifically deal with floating-point because the generic code above + // won't handle -0.0 (which should result in 0.0) properly. 
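Editor's note: the comment above is the reason the floating-point `abs()` overload goes through `std::abs` rather than the generic `x < static_cast<T>(0) ? -x : x` used for integral types: `-0.0 < 0` is false, so the ternary would hand back `-0.0` unchanged, while the expected result is `+0.0`. A short standalone demonstration:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  double x = -0.0;
  double generic = (x < 0.0) ? -x : x;  // -0.0 < 0 is false, keeps the sign bit
  double proper  = std::abs(x);         // clears the sign bit
  std::printf("generic:  %g (signbit=%d)\n", generic, std::signbit(generic));
  std::printf("std::abs: %g (signbit=%d)\n", proper,  std::signbit(proper));
}
```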
+ return map([](T x) -> T { return std::abs(x); }); + } + template < + typename complex_t_abs = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized abs() const { // complex_t_abs is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "complex_t_abs must be T"); @@ -361,12 +571,19 @@ struct Vectorized { return map([](T x) { return static_cast(std::abs(x)); }); } +<<<<<<< HEAD template ::value, int> = 0> +======= + template < + typename other_t_sgn = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized sgn() const { return map(at::native::sgn_impl); } +<<<<<<< HEAD template ::value, int> = 0> Vectorized angle() const { @@ -383,11 +600,37 @@ struct Vectorized { } template ::value, int> = 0> +======= + template < + typename other_t_angle = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized angle() const { + // other_t_angle is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "other_t_angle must be T"); + return map(at::native::angle_impl); // compiler is unable to resolve the + // overload without + } + template < + typename complex_t_angle = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized angle() const { + // complex_t_angle is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_angle must be T"); + return map([](T x) { return static_cast(std::arg(x)); }); + } + template < + typename other_t_real = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized real() const { // other_t_real is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "other_t_real must be T"); return *this; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized real() const { @@ -397,11 +640,27 @@ struct Vectorized { } template ::value, int> = 0> +======= + template < + typename complex_t_real = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized real() const { + // complex_t_real is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_real must be T"); + return map([](T x) { return static_cast(x.real()); }); + } + template < + typename other_t_imag = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized imag() const { // other_t_imag is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "other_t_imag must be T"); return Vectorized(0); } +<<<<<<< HEAD template ::value, int> = 0> Vectorized imag() const { @@ -411,16 +670,42 @@ struct Vectorized { } template ::value, int> = 0> +======= + template < + typename complex_t_imag = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized imag() const { + // complex_t_imag is for SFINAE and clarity. Make sure it is not changed. 
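Editor's note: `real()`, `imag()`, `angle()` and `conj()` each come as a pair of overloads selected with `std::enable_if_t` on `c10::is_complex<T>`, using a defaulted dummy template parameter (`other_t_real`, `complex_t_real`, and so on) so that SFINAE applies, plus a `static_assert` so callers cannot override that parameter. Below is a reduced sketch of the same dispatch pattern, substituting `std::is_floating_point` for `c10::is_complex` so it compiles on its own; the `Holder`/`describe` names are illustrative.

```cpp
#include <cstdio>
#include <type_traits>

template <typename T>
struct Holder {
  T value;

  // Chosen when T is NOT floating point (dummy parameter U exists only for SFINAE).
  template <typename U = T,
            std::enable_if_t<!std::is_floating_point_v<U>, int> = 0>
  const char* describe() const {
    static_assert(std::is_same_v<U, T>, "U is for SFINAE only; do not change it");
    return "non-floating-point path";
  }

  // Chosen when T IS floating point.
  template <typename U = T,
            std::enable_if_t<std::is_floating_point_v<U>, int> = 0>
  const char* describe() const {
    static_assert(std::is_same_v<U, T>, "U is for SFINAE only; do not change it");
    return "floating-point path";
  }
};

int main() {
  std::printf("%s\n", Holder<int>{1}.describe());      // non-floating-point path
  std::printf("%s\n", Holder<float>{1.f}.describe());  // floating-point path
}
```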
+ static_assert( + std::is_same_v, "complex_t_imag must be T"); + return map([](T x) { return static_cast(x.imag()); }); + } + template < + typename other_t_conj = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized conj() const { // other_t_conj is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "other_t_conj must be T"); return *this; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized conj() const { // complex_t_conj is for SFINAE and clarity. Make sure it is not changed. static_assert(std::is_same_v, "complex_t_conj must be T"); +======= + template < + typename complex_t_conj = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized conj() const { + // complex_t_conj is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_conj must be T"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map([](T x) { return static_cast(std::conj(x)); }); } Vectorized acos() const { @@ -441,7 +726,11 @@ struct Vectorized { Vectorized atanh() const { return map(std::atanh); } +<<<<<<< HEAD Vectorized atan2(const Vectorized &exp) const { +======= + Vectorized atan2(const Vectorized& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::atan2(values[i], exp[i]); @@ -449,9 +738,15 @@ struct Vectorized { return ret; } template < +<<<<<<< HEAD typename U = T, typename std::enable_if_t, int> = 0> Vectorized copysign(const Vectorized &sign) const { +======= + typename U = T, + typename std::enable_if_t, int> = 0> + Vectorized copysign(const Vectorized& sign) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (size_type i = 0; i < size(); i++) { ret[i] = c10::copysign(values[i], sign[i]); @@ -483,8 +778,13 @@ struct Vectorized { return *this - this->trunc(); } template < +<<<<<<< HEAD typename U = T, typename std::enable_if_t, int> = 0> +======= + typename U = T, + typename std::enable_if_t, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized fmod(const Vectorized& q) const { // U is for SFINAE purposes only. Make sure it is not changed. static_assert(std::is_same_v, "U must be T"); @@ -503,13 +803,20 @@ struct Vectorized { Vectorized log1p() const { return map(std::log1p); } +<<<<<<< HEAD template ::value, int> = 0> +======= + template < + typename other_t_log2 = T, + typename std::enable_if_t::value, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized log2() const { // other_t_log2 is for SFINAE and clarity. Make sure it is not changed. 
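Editor's note: the real-valued `log2()` overload here maps straight to `std::log2`; the complex overload just below instead computes `log(x) / log(2)`, since the standard library provides no complex `log2`. A standalone check of that change-of-base identity with `std::complex` (the sample value is arbitrary):

```cpp
#include <complex>
#include <cstdio>

int main() {
  std::complex<double> z(3.0, 4.0);
  std::complex<double> log2_z = std::log(z) / std::log(2.0);  // log2 via change of base
  std::complex<double> back   = std::pow(2.0, log2_z);        // should reproduce z
  std::printf("log2(3+4i) = %f%+fi, round trip = %f%+fi\n",
              log2_z.real(), log2_z.imag(), back.real(), back.imag());
}
```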
static_assert(std::is_same_v, "other_t_log2 must be T"); return map(std::log2); } +<<<<<<< HEAD template ::value, int> = 0> Vectorized log2() const { @@ -517,6 +824,18 @@ struct Vectorized { static_assert(std::is_same_v, "complex_t_log2 must be T"); const T log_2 = T(std::log(2.0)); return Vectorized(map(std::log))/Vectorized(log_2); +======= + template < + typename complex_t_log2 = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized log2() const { + // complex_t_log2 is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_log2 must be T"); + const T log_2 = T(std::log(2.0)); + return Vectorized(map(std::log)) / Vectorized(log_2); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Vectorized ceil() const { return map(at::native::ceil_impl); @@ -530,7 +849,11 @@ struct Vectorized { Vectorized floor() const { return map(at::native::floor_impl); } +<<<<<<< HEAD Vectorized hypot(const Vectorized &b) const { +======= + Vectorized hypot(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::hypot(values[i], b[i]); @@ -546,14 +869,22 @@ struct Vectorized { Vectorized digamma() const { return map(calc_digamma); } +<<<<<<< HEAD Vectorized igamma(const Vectorized &x) const { +======= + Vectorized igamma(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = calc_igamma(values[i], x[i]); } return ret; } +<<<<<<< HEAD Vectorized igammac(const Vectorized &x) const { +======= + Vectorized igammac(const Vectorized& x) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = calc_igammac(values[i], x[i]); @@ -566,7 +897,11 @@ struct Vectorized { // promotion return map([](T x) -> T { return -x; }); } +<<<<<<< HEAD Vectorized nextafter(const Vectorized &b) const { +======= + Vectorized nextafter(const Vectorized& b) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::nextafter(values[i], b[i]); @@ -574,7 +909,12 @@ struct Vectorized { return ret; } Vectorized round() const { +<<<<<<< HEAD // We do not use std::round because we would like to round midway numbers to the nearest even integer. +======= + // We do not use std::round because we would like to round midway numbers to + // the nearest even integer. 
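Editor's note: the comment above is the reason `round()` maps through `at::native::round_impl` rather than `std::round`: `std::round` sends halfway cases away from zero, whereas the intent is round-half-to-even. The sketch below only illustrates the difference between the two conventions, using `std::nearbyint` under the default round-to-nearest environment as a stand-in for half-to-even; it does not claim to reproduce `round_impl` itself.

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double halves[] = {0.5, 1.5, 2.5, -0.5, -1.5, -2.5};
  for (double x : halves) {
    // std::round: away from zero            -> 1, 2, 3, -1, -2, -3
    // std::nearbyint (round-to-nearest mode): ties to even -> 0, 2, 2, -0, -2, -2
    std::printf("x=%5.1f  round=%5.1f  nearbyint=%5.1f\n",
                x, std::round(x), std::nearbyint(x));
  }
}
```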
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return map(at::native::round_impl); } Vectorized sin() const { @@ -604,20 +944,33 @@ struct Vectorized { Vectorized rsqrt() const { return map([](T x) { return (T)1 / std::sqrt(x); }); } +<<<<<<< HEAD Vectorized pow(const Vectorized &exp) const { +======= + Vectorized pow(const Vectorized& exp) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::pow(values[i], exp[i]); } return ret; } +<<<<<<< HEAD T reduce_add() const { +======= + T reduce_add() const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return reduce([](T x, T y) -> T { return x + y; }); } T reduce_max() const { return reduce(std::max); } +<<<<<<< HEAD private: +======= + + private: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template inline Vectorized binary_pred(const Vectorized& other, Op op) const { // All bits are set to 1 if the pred is true, otherwise 0. @@ -632,6 +985,7 @@ struct Vectorized { return vector; } +<<<<<<< HEAD public: Vectorized operator==(const Vectorized& other) const { return binary_pred(other, std::equal_to()); } Vectorized operator!=(const Vectorized& other) const { return binary_pred(other, std::not_equal_to()); } @@ -646,11 +1000,41 @@ struct Vectorized { // 1 if the pred is true, otherwise 0. Vectorized vector; for (int i = 0; i != size(); ++ i) { +======= + public: + Vectorized operator==(const Vectorized& other) const { + return binary_pred(other, std::equal_to()); + } + Vectorized operator!=(const Vectorized& other) const { + return binary_pred(other, std::not_equal_to()); + } + Vectorized operator>=(const Vectorized& other) const { + return binary_pred(other, std::greater_equal()); + } + Vectorized operator<=(const Vectorized& other) const { + return binary_pred(other, std::less_equal()); + } + Vectorized operator>(const Vectorized& other) const { + return binary_pred(other, std::greater()); + } + Vectorized operator<(const Vectorized& other) const { + return binary_pred(other, std::less()); + } + + private: + template + inline Vectorized binary_pred_bool(const Vectorized& other, Op op) + const { + // 1 if the pred is true, otherwise 0. 
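Editor's note: there are two comparison families in this class. `operator==`, `operator<`, ... go through `binary_pred` and fill a lane with all bits set when the predicate holds (a mask whose set low bit can drive `blendv`), while `eq`, `lt`, ... go through `binary_pred_bool` and produce plain 0/1 values, as the comment above says. A scalar sketch of the two encodings for a single float lane, using the same-width-integer bit copy the header relies on; the helper names are illustrative.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// binary_pred-style lane: every bit set when the predicate is true.
float mask_lane(bool pred) {
  uint32_t bits = pred ? 0xFFFFFFFFu : 0u;
  float out;
  std::memcpy(&out, &bits, sizeof(out));  // bit copy, not a value conversion
  return out;
}

// binary_pred_bool-style lane: numeric 0 or 1.
float bool_lane(bool pred) { return pred ? 1.f : 0.f; }

int main() {
  float m = mask_lane(3.f < 5.f);
  uint32_t raw;
  std::memcpy(&raw, &m, sizeof(raw));
  std::printf("mask lane bits = 0x%08X, bool lane = %.1f\n", raw, bool_lane(3.f < 5.f));
}
```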
+ Vectorized vector; + for (int i = 0; i != size(); ++i) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) vector[i] = static_cast(op(values[i], other.values[i])); } return vector; } +<<<<<<< HEAD public: Vectorized eq(const Vectorized& other) const { return binary_pred_bool(other, std::equal_to()); } Vectorized ne(const Vectorized& other) const { return binary_pred_bool(other, std::not_equal_to()); } @@ -661,6 +1045,55 @@ struct Vectorized { }; template Vectorized inline operator+(const Vectorized &a, const Vectorized &b) { +======= + public: + Vectorized eq(const Vectorized& other) const { + return binary_pred_bool(other, std::equal_to()); + } + Vectorized ne(const Vectorized& other) const { + return binary_pred_bool(other, std::not_equal_to()); + } + Vectorized gt(const Vectorized& other) const { + return binary_pred_bool(other, std::greater()); + } + Vectorized ge(const Vectorized& other) const { + return binary_pred_bool(other, std::greater_equal()); + } + Vectorized lt(const Vectorized& other) const { + return binary_pred_bool(other, std::less()); + } + Vectorized le(const Vectorized& other) const { + return binary_pred_bool(other, std::less_equal()); + } +}; + +template +Vectorized inline operator-(const Vectorized& a) { + return a.neg(); +} + +// There is an implicit conversion that would make this work if +// these operators weren't template functions, but they are template +// functions (and can't be moved to be non-member friends defined in +// the class body as suggested in +// https://stackoverflow.com/questions/9787593/implicit-type-conversion-with-template/9788255#9788255 +// because we have a lot of disparate specializations of +// Vectorized). So, just explicitly make scalars work. 
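Editor's note: the comment ending here motivates the `VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC` / `..._OP` macros defined just below. Because the binary operators are function templates, a bare scalar argument never triggers the implicit broadcasting `Vectorized<T>(T)` conversion during template argument deduction, so each op also needs explicit `(Vectorized<T>, T)` and `(T, Vectorized<T>)` overloads. Below is a reduced reproduction with a toy `Vec` type standing in for `Vectorized<T>`:

```cpp
#include <cstdio>

template <typename T>
struct Vec {
  T v;
  Vec(T x) : v(x) {}  // implicit broadcast from a scalar
};

template <typename T>
Vec<T> operator+(const Vec<T>& a, const Vec<T>& b) { return Vec<T>(a.v + b.v); }

// Without these two, `a + 2.0f` fails to compile: deduction needs Vec<T> on
// both sides and never considers the Vec(T) conversion. The macro generates
// exactly this pair for every binary operator / named binary function.
template <typename T>
Vec<T> operator+(const Vec<T>& a, T b) { return a + Vec<T>(b); }
template <typename T>
Vec<T> operator+(T a, const Vec<T>& b) { return Vec<T>(a) + b; }

int main() {
  Vec<float> a(1.5f);
  Vec<float> c = a + 2.0f;  // works only because of the scalar overloads
  Vec<float> d = 2.0f + a;
  std::printf("%f %f\n", c.v, d.v);
}
```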
+#define VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(name) \ + template \ + Vectorized inline name(const Vectorized& a, T b) { \ + return name(a, Vectorized(b)); \ + } \ + template \ + Vectorized inline name(T a, const Vectorized& b) { \ + return name(Vectorized(a), b); \ + } +#define VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(op) \ + VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(operator op) + +template +Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] + b[i]; @@ -668,7 +1101,14 @@ template Vectorized inline operator+(const Vectorized &a, const return c; } +<<<<<<< HEAD template Vectorized inline operator-(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(+) + +template +Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] - b[i]; @@ -676,7 +1116,14 @@ template Vectorized inline operator-(const Vectorized &a, const return c; } +<<<<<<< HEAD template Vectorized inline operator*(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(-) + +template +Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] * b[i]; @@ -684,7 +1131,15 @@ template Vectorized inline operator*(const Vectorized &a, const return c; } +<<<<<<< HEAD template Vectorized inline operator/(const Vectorized &a, const Vectorized &b) __ubsan_ignore_float_divide_by_zero__ { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(*) + +template +Vectorized inline operator/(const Vectorized& a, const Vectorized& b) + __ubsan_ignore_float_divide_by_zero__ { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] / b[i]; @@ -692,6 +1147,7 @@ template Vectorized inline operator/(const Vectorized &a, const return c; } +<<<<<<< HEAD template , int> = 0> Vectorized inline operator%(const Vectorized &a, const Vectorized &b) __ubsan_ignore_float_divide_by_zero__ { @@ -700,6 +1156,22 @@ Vectorized inline operator%(const Vectorized &a, const Vectorized &b) _ template Vectorized inline operator||( const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(/) + +template , int> = 0> +Vectorized inline operator%(const Vectorized& a, const Vectorized& b) + __ubsan_ignore_float_divide_by_zero__ { + return a - a / b * b; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(%) + +template +Vectorized inline operator||( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] || b[i]; @@ -707,11 +1179,22 @@ template Vectorized inline operator||( return c; } +<<<<<<< HEAD // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is 
a NaN. template ::value, int> = 0> Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(||) + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = (a[i] > b[i]) ? a[i] : b[i]; @@ -725,9 +1208,16 @@ Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { +======= +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = (std::abs(a[i]) > std::abs(b[i])) ? a[i] : b[i]; @@ -741,11 +1231,22 @@ Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { return c; } +<<<<<<< HEAD // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. template ::value, int> = 0> Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(maximum) + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = (a[i] < b[i]) ? a[i] : b[i]; @@ -759,9 +1260,16 @@ Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { +======= +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = (std::abs(a[i]) < std::abs(b[i])) ? 
a[i] : b[i]; @@ -775,9 +1283,21 @@ Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, const Vectorized &max_vec) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(minimum) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_vec, + const Vectorized& max_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = std::min(std::max(a[i], min_vec[i]), max_vec[i]); @@ -785,9 +1305,54 @@ Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_vec) { +======= +#define VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(name) \ + template \ + Vectorized inline name( \ + const Vectorized& a, const Vectorized& b, T c) { \ + return name(a, b, Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name( \ + const Vectorized& a, T b, const Vectorized& c) { \ + return name(a, Vectorized(b), c); \ + } \ + \ + template \ + Vectorized inline name(const Vectorized& a, T b, T c) { \ + return name(a, Vectorized(b), Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name( \ + T a, const Vectorized& b, const Vectorized& c) { \ + return name(Vectorized(a), b, c); \ + } \ + \ + template \ + Vectorized inline name(T a, const Vectorized& b, T c) { \ + return name(Vectorized(a), b, Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name(T a, T b, const Vectorized& c) { \ + return name(Vectorized(a), Vectorized(b), c); \ + } + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(clamp) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] > max_vec[i] ? max_vec[i] : a[i]; @@ -795,9 +1360,20 @@ Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_ return c; } +<<<<<<< HEAD template ::value, int> = 0> Vectorized inline clamp_min(const Vectorized &a, const Vectorized &min_vec) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(clamp_max) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_vec) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { c[i] = a[i] < min_vec[i] ? 
min_vec[i] : a[i]; @@ -805,10 +1381,16 @@ Vectorized inline clamp_min(const Vectorized &a, const Vectorized &min_ return c; } +<<<<<<< HEAD +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(clamp_min) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct Vectorizedi; #if defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) template +<<<<<<< HEAD static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vectorized &b, Op op) { int_vector buffer; #if defined(CPU_CAPABILITY_AVX2) @@ -817,6 +1399,23 @@ static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vect #elif defined(CPU_CAPABILITY_AVX512) int_vector a_buffer = _mm512_load_si512(reinterpret_cast((const T*)a)); int_vector b_buffer = _mm512_load_si512(reinterpret_cast((const T*)b)); +======= +static inline Vectorized bitwise_binary_op( + const Vectorized& a, + const Vectorized& b, + Op op) { + int_vector buffer; +#if defined(CPU_CAPABILITY_AVX2) + int_vector a_buffer = + _mm256_load_si256(reinterpret_cast((const T*)a)); + int_vector b_buffer = + _mm256_load_si256(reinterpret_cast((const T*)b)); +#elif defined(CPU_CAPABILITY_AVX512) + int_vector a_buffer = + _mm512_load_si512(reinterpret_cast((const T*)a)); + int_vector b_buffer = + _mm512_load_si512(reinterpret_cast((const T*)b)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif buffer = op(a_buffer, b_buffer); __at_align__ T results[Vectorized::size()]; @@ -829,6 +1428,7 @@ static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vect return Vectorized::loadu(results); } +<<<<<<< HEAD template>::value, int> = 0> inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { // We enclose _mm512_and_si512 or _mm256_and_si256 with lambda because it is always_inline @@ -854,6 +1454,54 @@ inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm256_xor_si256(a, b); }); #elif defined(CPU_CAPABILITY_AVX512) return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm512_xor_si512(a, b); }); +======= +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + // We enclose _mm512_and_si512 or _mm256_and_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_and_si256(a, b); }); +#elif defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_and_si512(a, b); }); +#endif +} +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + // We enclose _mm512_or_si512 or _mm256_or_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_or_si256(a, b); }); +#elif defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_or_si512(a, b); }); +#endif +} +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + // We enclose 
_mm512_xor_si512 or _mm256_xor_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_xor_si256(a, b); }); +#elif defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_xor_si512(a, b); }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } @@ -866,12 +1514,28 @@ auto load(char const* data) -> T { return ret; } +<<<<<<< HEAD template static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vectorized &b, Op op) { static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t); __at_align__ intmax_t buffer[element_no]; static_assert(VECTOR_WIDTH % sizeof(intmax_t) == 0, "VECTOR_WIDTH not a multiple of sizeof(intmax_t)"); static_assert(sizeof(buffer) == sizeof(Vectorized), "sizeof(buffer) must match sizeof(Vectorized)"); +======= +template +static inline Vectorized bitwise_binary_op( + const Vectorized& a, + const Vectorized& b, + Op op) { + static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t); + __at_align__ intmax_t buffer[element_no]; + static_assert( + VECTOR_WIDTH % sizeof(intmax_t) == 0, + "VECTOR_WIDTH not a multiple of sizeof(intmax_t)"); + static_assert( + sizeof(buffer) == sizeof(Vectorized), + "sizeof(buffer) must match sizeof(Vectorized)"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // We should be using memcpy in order to respect the strict aliasing rule // see: https://github.com/pytorch/pytorch/issues/66119 // Using char* is defined in the C11 standard 6.5 Expression paragraph 7 @@ -889,6 +1553,7 @@ static inline Vectorized bitwise_binary_op(const Vectorized &a, const Vect return Vectorized::loadu(buffer); } +<<<<<<< HEAD template>, int> = 0> inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { return bitwise_binary_op(a, b, std::bit_and()); @@ -898,12 +1563,33 @@ inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { return bitwise_binary_op(a, b, std::bit_or()); } template>, int> = 0> +======= +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return bitwise_binary_op(a, b, std::bit_and()); +} +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return bitwise_binary_op(a, b, std::bit_or()); +} +template < + class T, + typename std:: + enable_if_t>, int> = 0> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { return bitwise_binary_op(a, b, std::bit_xor()); } #endif // defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) +<<<<<<< HEAD template>, int> = 0> inline Vectorized operator~(const Vectorized& a) { using int_t = int_same_size_t; @@ -912,11 +1598,36 @@ inline Vectorized operator~(const Vectorized& a) { } template Vectorized inline operator<<(const Vectorized &a, const Vectorized &b) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(&) +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(|) +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(^) + +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator~(const 
Vectorized& a) { + using int_t = int_same_size_t; + Vectorized ones(c10::bit_cast((int_t)(~(int_t)0))); // All bits are 1 + return a ^ ones; +} + +template +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) constexpr T max_shift = sizeof(T) * CHAR_BIT; Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { T shift = b[i]; +<<<<<<< HEAD if ((static_cast>(shift) < 0) || (shift >= max_shift)) { +======= + if ((static_cast>(shift) < 0) || + (shift >= max_shift)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c[i] = 0; } else { c[i] = static_cast>(a[i]) << shift; @@ -925,13 +1636,25 @@ template Vectorized inline operator<<(const Vectorized &a, const return c; } +<<<<<<< HEAD template Vectorized inline operator>>(const Vectorized &a, const Vectorized &b) { +======= +template +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // right shift value to retain sign bit for signed and no bits for unsigned constexpr T max_shift = sizeof(T) * CHAR_BIT - std::is_signed_v; Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { T shift = b[i]; +<<<<<<< HEAD if ((static_cast>(shift) < 0) || (shift >= max_shift)) { +======= + if ((static_cast>(shift) < 0) || + (shift >= max_shift)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c[i] = a[i] >> max_shift; } else { c[i] = a[i] >> shift; @@ -941,44 +1664,73 @@ template Vectorized inline operator>>(const Vectorized &a, const } template +<<<<<<< HEAD inline Vectorized& operator += (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator+=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a + b; return a; } template +<<<<<<< HEAD inline Vectorized& operator -= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator-=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a - b; return a; } template +<<<<<<< HEAD inline Vectorized& operator /= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator/=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a / b; return a; } template +<<<<<<< HEAD inline Vectorized& operator %= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator%=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a % b; return a; } template +<<<<<<< HEAD inline Vectorized& operator *= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator*=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a * b; return a; } template +<<<<<<< HEAD inline Vectorized& operator <<= (Vectorized& a, const 
Vectorized& b) { +======= +inline Vectorized& operator<<=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a << b; return a; } template +<<<<<<< HEAD inline Vectorized& operator >>= (Vectorized& a, const Vectorized& b) { +======= +inline Vectorized& operator>>=(Vectorized& a, const Vectorized& b) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) a = a >> b; return a; } template +<<<<<<< HEAD inline Vectorized fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { return a * b + c; } @@ -988,6 +1740,27 @@ inline Vectorized fmsub(const Vectorized& a, const Vectorized& b, const return a * b - c; } +======= +inline Vectorized fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b + c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmadd) + +template +inline Vectorized fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b - c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmsub) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template Vectorized inline operator&&( const Vectorized& a, @@ -999,9 +1772,19 @@ Vectorized inline operator&&( return ret; } +<<<<<<< HEAD template std::enable_if_t> inline gather(T const* base_addr, const Vectorized>& vindex) { +======= +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(&&) + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + T>> inline gather(T const* base_addr, const Vectorized>& vindex) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static constexpr int size = Vectorized::size(); int_same_size_t index_arr[size]; vindex.store(static_cast(index_arr)); @@ -1013,36 +1796,65 @@ inline gather(T const* base_addr, const Vectorized>& vindex) } template +<<<<<<< HEAD std::enable_if_t> inline mask_gather(const Vectorized& src, T const* base_addr, const Vectorized>& vindex, Vectorized& mask) { static constexpr int size = Vectorized::size(); T src_arr[size]; int_same_size_t mask_arr[size]; // use int type so we can logical and +======= +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + T const* base_addr, + const Vectorized>& vindex, + Vectorized& mask) { + static constexpr int size = Vectorized::size(); + T src_arr[size]; + int_same_size_t mask_arr[size]; // use int type so we can logical and +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int_same_size_t index_arr[size]; src.store(static_cast(src_arr)); mask.store(static_cast(mask_arr)); vindex.store(static_cast(index_arr)); T buffer[size]; for (const auto i : c10::irange(size)) { +<<<<<<< HEAD if (mask_arr[i] & 0x01) { // check highest bit +======= + if (mask_arr[i] & 0x01) { // check highest bit +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; } else { buffer[i] = src_arr[i]; } } +<<<<<<< HEAD mask = Vectorized(static_cast(0)); // "zero out" mask +======= + mask = Vectorized(static_cast(0)); // "zero out" mask +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
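// Illustrative sketch (not part of the patch): the per-lane arithmetic that the
// scalar gather fallback above performs -- an element load at a byte offset of
// index * scale from base_addr. Plain C++ only; the helper and demo names are
// made up for illustration.
#include <cassert>
#include <cstdint>

template <int scale, typename T>
T scalar_gather_one_lane(const T* base_addr, std::int64_t index) {
  static_assert(scale == 1 || scale == 2 || scale == 4 || scale == 8,
                "scale must be 1, 2, 4 or 8");
  // Mirrors: buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
  return base_addr[index * scale / static_cast<std::int64_t>(sizeof(T))];
}

inline void scalar_gather_demo() {
  float src[4] = {10.f, 11.f, 12.f, 13.f};
  // With scale == sizeof(float), indices behave as plain element indices.
  assert(scalar_gather_one_lane<4>(src, 2) == 12.f);
}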
mixed dtypes with float/bfloat16/half (#2791)) return Vectorized::loadu(static_cast(buffer)); } // Cast a given vector to another type without changing the bits representation. // So a Vectorized of 512 bits containing all ones can be cast to a +<<<<<<< HEAD // Vectorized of 512 bits containing all ones (i.e., eight negative 1s). // A Vec of 256 bits containing all ones can be cast to a // Vec of 256 bits containing all ones (i.e., four negative 1s). // There is a struct here because we don't have static_if and I can't // partially specialize a templated function. template +======= +// Vectorized of 512 bits containing all ones (i.e., eight negative +// 1s). A Vec of 256 bits containing all ones can be cast to a +// Vec of 256 bits containing all ones (i.e., four negative 1s). +// There is a struct here because we don't have static_if and I can't +// partially specialize a templated function. +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct CastImpl { static inline Vectorized apply(const Vectorized& src) { src_t src_arr[Vectorized::size()]; @@ -1051,19 +1863,28 @@ struct CastImpl { } }; +<<<<<<< HEAD template +======= +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct CastImpl { static inline Vectorized apply(const Vectorized& src) { return src; } }; +<<<<<<< HEAD template +======= +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline Vectorized cast(const Vectorized& src) { return CastImpl::apply(src); } template > +<<<<<<< HEAD inline Vectorized convert_to_int_of_same_size(const Vectorized& src) { static_assert(sizeof(T) == sizeof(IntType)); static constexpr int size = Vectorized::size(); @@ -1073,22 +1894,52 @@ inline Vectorized convert_to_int_of_same_size(const Vectorized& src) std::array buffer; std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const T& x) { return static_cast(x); }); +======= +inline Vectorized convert_to_int_of_same_size( + const Vectorized& src) { + static_assert(sizeof(T) == sizeof(IntType)); + static constexpr int size = Vectorized::size(); + + std::array src_arr = {}; + src.store(static_cast(src_arr.data())); + std::array buffer; + std::transform( + src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const T& x) { + return static_cast(x); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Vectorized::loadu(static_cast(buffer.data())); } template > +<<<<<<< HEAD inline Vectorized convert_to_fp_of_same_size(const Vectorized& src) { +======= +inline Vectorized convert_to_fp_of_same_size( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(sizeof(T) == sizeof(IntType)); static constexpr int size = Vectorized::size(); std::array src_arr; src.store(static_cast(src_arr.data())); std::array buffer; +<<<<<<< HEAD std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const IntType& x) { return static_cast(x); }); return Vectorized::loadu(static_cast(buffer.data())); } +======= + std::transform( + src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const IntType& x) { + return static_cast(x); + }); + return Vectorized::loadu(static_cast(buffer.data())); +} + +// 
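// Illustrative sketch (not part of the patch): the scalar analogue of the two
// conversions above -- cast<> keeps the bit pattern unchanged, while
// convert_to_int_of_same_size() converts the numeric value. Assumes a C++20
// compiler for std::bit_cast (the surrounding code uses c10::bit_cast instead).
#include <bit>
#include <cassert>
#include <cstdint>

inline void bitcast_vs_convert_demo() {
  float x = -1.5f;
  // Bit-preserving reinterpretation: the same 32 bits viewed as an integer.
  std::uint32_t bits = std::bit_cast<std::uint32_t>(x);
  // Value conversion: rounds toward zero like the static_cast in the loop above.
  std::int32_t value = static_cast<std::int32_t>(x);
  assert(bits == 0xBFC00000u);
  assert(value == -1);
}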
clang-format off +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Example inputs for AVX512: // a Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} // b Vectorized = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15} @@ -1099,8 +1950,16 @@ inline Vectorized convert_to_fp_of_same_size(const Vectorized& src) // b Vectorized = {a4, b4, a5, b5, a6, b6, a7, b7} // returns: Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7} // Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7} +<<<<<<< HEAD template inline std::enable_if_t::size() % 2 == 0, std::pair, Vectorized>> +======= +// clang-format on +template +inline std::enable_if_t< + Vectorized::size() % 2 == 0, + std::pair, Vectorized>> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) deinterleave2(const Vectorized& a, const Vectorized& b) { static constexpr int size = Vectorized::size(); static constexpr int half_size = size / 2; @@ -1116,10 +1975,21 @@ deinterleave2(const Vectorized& a, const Vectorized& b) { buffer2[i] = a_arr[i * 2 + 1]; buffer2[half_size + i] = b_arr[i * 2 + 1]; } +<<<<<<< HEAD return std::make_pair(Vectorized::loadu(static_cast(buffer1)), Vectorized::loadu(static_cast(buffer2))); } +======= + return std::make_pair( + Vectorized::loadu(static_cast(buffer1)), + Vectorized::loadu(static_cast(buffer2))); +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(deinterleave2) + +// clang-format off +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // inverse operation of deinterleave2 // Example inputs for AVX512: // a Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15} @@ -1131,8 +2001,16 @@ deinterleave2(const Vectorized& a, const Vectorized& b) { // b Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7} // returns: Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3} // Vectorized = {a4, b4, a5, b5, a6, b6, a7, b7} +<<<<<<< HEAD template inline std::enable_if_t::size() % 2 == 0, std::pair, Vectorized>> +======= +// clang-format on +template +inline std::enable_if_t< + Vectorized::size() % 2 == 0, + std::pair, Vectorized>> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) interleave2(const Vectorized& a, const Vectorized& b) { static constexpr int size = Vectorized::size(); static constexpr int half_size = size / 2; @@ -1148,6 +2026,7 @@ interleave2(const Vectorized& a, const Vectorized& b) { buffer2[i * 2] = a_arr[half_size + i]; buffer2[i * 2 + 1] = b_arr[half_size + i]; } +<<<<<<< HEAD return std::make_pair(Vectorized::loadu(static_cast(buffer1)), Vectorized::loadu(static_cast(buffer2))); } @@ -1156,6 +2035,23 @@ template inline void convert(const src_T *src, dst_T *dst, int64_t n) { #ifndef _MSC_VER # pragma unroll +======= + return std::make_pair( + Vectorized::loadu(static_cast(buffer1)), + Vectorized::loadu(static_cast(buffer2))); +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(interleave2) + +#undef VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC +#undef VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP +#undef VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC + +template +inline void convert(const src_T* src, dst_T* dst, int64_t n) { +#ifndef _MSC_VER +#pragma unroll +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
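// Illustrative sketch (not part of the patch): a scalar reference for the
// deinterleave2 shuffle documented above, using std::array in place of
// Vectorized<T>; function and variable names are made up for illustration.
#include <array>
#include <cassert>
#include <cstddef>
#include <utility>

template <typename T, std::size_t N>
std::pair<std::array<T, N>, std::array<T, N>> scalar_deinterleave2(
    const std::array<T, N>& a, const std::array<T, N>& b) {
  static_assert(N % 2 == 0, "lane count must be even");
  constexpr std::size_t half = N / 2;
  std::array<T, N> out1{}, out2{};
  for (std::size_t i = 0; i < half; ++i) {
    out1[i] = a[2 * i];            // even lanes of a
    out1[half + i] = b[2 * i];     // even lanes of b
    out2[i] = a[2 * i + 1];        // odd lanes of a
    out2[half + i] = b[2 * i + 1]; // odd lanes of b
  }
  return {out1, out2};
}

inline void scalar_deinterleave2_demo() {
  std::array<int, 4> a{0, 100, 1, 101}; // {a0, b0, a1, b1}
  std::array<int, 4> b{2, 102, 3, 103}; // {a2, b2, a3, b3}
  auto [evens, odds] = scalar_deinterleave2(a, b);
  assert((evens == std::array<int, 4>{0, 1, 2, 3}));       // {a0, a1, a2, a3}
  assert((odds == std::array<int, 4>{100, 101, 102, 103})); // {b0, b1, b2, b3}
}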
float/bfloat16/half (#2791)) #endif for ([[maybe_unused]] const auto i : c10::irange(n)) { *dst = c10::convert(c10::load(src)); @@ -1165,7 +2061,11 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) { } template +<<<<<<< HEAD inline Vectorized flip(const Vectorized & data) { +======= +inline Vectorized flip(const Vectorized& data) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static constexpr int size = Vectorized::size(); T output[size]; T buffer[size]; @@ -1176,6 +2076,7 @@ inline Vectorized flip(const Vectorized & data) { return Vectorized::loadu(static_cast(output)); } +<<<<<<< HEAD // Transpose the `src` buffer of type `T` and size (M,N) into the `dst` buffer. `ld_src` is the leading // dimension of `src` and `ld_dst` is the leading dimension of `dst`. template @@ -1183,11 +2084,28 @@ inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst, for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { dst[j*ld_dst + i] = src[i*ld_src + j]; +======= +// Transpose the `src` buffer of type `T` and size (M,N) into the `dst` buffer. +// `ld_src` is the leading dimension of `src` and `ld_dst` is the leading +// dimension of `dst`. +template +inline void transpose_mxn( + const T* src, + int64_t ld_src, + T* dst, + int64_t ld_dst, + int M, + int N) { + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + dst[j * ld_dst + i] = src[i * ld_src + j]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } } template +<<<<<<< HEAD inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst) { transpose_mxn(src, ld_src, dst, ld_dst, M, N); } @@ -1198,3 +2116,20 @@ inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst) #include #include #include +======= +inline void transpose_mxn( + const T* src, + int64_t ld_src, + T* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, ld_dst, M, N); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +// additional headers for more operations that depend on vec_base +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/cpu/vec/vec_convert.h b/aten/src/ATen/cpu/vec/vec_convert.h index a5cee03dabcf..98ad0cab93fb 100644 --- a/aten/src/ATen/cpu/vec/vec_convert.h +++ b/aten/src/ATen/cpu/vec/vec_convert.h @@ -28,8 +28,13 @@ struct VecConvert { }; template +<<<<<<< HEAD inline std::enable_if_t, Vectorized> convert(const Vectorized& src) { +======= +inline std::enable_if_t, Vectorized> convert( + const Vectorized& src) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return src; } @@ -62,4 +67,21 @@ convert(const VectorizedN& src) { } } // namespace CPU_CAPABILITY +<<<<<<< HEAD +======= + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline std::tuple, Vectorized> convert_to_float( + const Vectorized&); + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline Vectorized convert_from_float( + const Vectorized&, + const Vectorized&); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec_half.h 
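// Illustrative sketch (not part of the patch): the scalar idea behind the
// convert_to_float/convert_from_float declarations above for bfloat16 -- the
// 16 stored bits occupy the upper half of an IEEE-754 binary32 value. The
// narrowing shown here simply truncates; real implementations typically round
// to nearest-even. Assumes a C++20 compiler for std::bit_cast.
#include <bit>
#include <cstdint>

inline float bf16_bits_to_float(std::uint16_t bits) {
  return std::bit_cast<float>(static_cast<std::uint32_t>(bits) << 16);
}

inline std::uint16_t float_to_bf16_bits_truncate(float f) {
  return static_cast<std::uint16_t>(std::bit_cast<std::uint32_t>(f) >> 16);
}

inline void bf16_roundtrip_demo() {
  float x = 1.0f;                                   // exactly representable
  std::uint16_t b = float_to_bf16_bits_truncate(x); // 0x3F80
  float y = bf16_bits_to_float(b);                  // back to 1.0f
  (void)y;
}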
b/aten/src/ATen/cpu/vec/vec_half.h index c7c90cc95b47..67aedf6551f2 100644 --- a/aten/src/ATen/cpu/vec/vec_half.h +++ b/aten/src/ATen/cpu/vec/vec_half.h @@ -103,7 +103,13 @@ static inline void transpose_pad_2x32_block( _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 32), d1); } #else +<<<<<<< HEAD TORCH_CHECK(false, "transpose_pad_2x32_block is only supported when avx512 is supported") +======= + TORCH_CHECK( + false, + "transpose_pad_2x32_block is only supported when avx512 is supported") +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } @@ -124,16 +130,27 @@ static inline void pack_vnni2( for (; bk < _K; bk += 2) { int64_t bn = 0; for (; bn < _N; bn += 32) { +<<<<<<< HEAD transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src); } int64_t nrem = N - bn; if (nrem > 0) { transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem); +======= + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src); + } + int64_t nrem = N - bn; + if (nrem > 0) { + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } if (K % 2 == 1) { int64_t bn = 0; for (; bn < _N; bn += 32) { +<<<<<<< HEAD transpose_pad_2x32_block(src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1); } int64_t nrem = N - bn; @@ -147,5 +164,21 @@ TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported") } +======= + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1); + } + int64_t nrem = N - bn; + if (nrem > 0) { + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1, nrem); + } + } +#else + TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported") +#endif +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace CPU_CAPABILITY } // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec_mask.h b/aten/src/ATen/cpu/vec/vec_mask.h index c547e5911ecb..b1a0cf0441c0 100644 --- a/aten/src/ATen/cpu/vec/vec_mask.h +++ b/aten/src/ATen/cpu/vec/vec_mask.h @@ -68,7 +68,16 @@ struct VecMaskTo { } }; +<<<<<<< HEAD template +======= +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct VecMaskCast { static inline VecMask apply( const VecMask& vec_mask) { @@ -88,15 +97,27 @@ struct VecMaskCheck { static inline bool all_zero(const VectorizedN& vec_mask) { __at_align__ T mask[VectorizedN::size()]; vec_mask.store(mask); +<<<<<<< HEAD return std::all_of( mask, mask + VectorizedN::size(), [](T m) { return m == static_cast(0); }); +======= + return std::all_of(mask, mask + VectorizedN::size(), [](T m) { + return m == static_cast(0); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static inline bool all_masked(const VectorizedN& vec_mask) { __at_align__ T mask[VectorizedN::size()]; vec_mask.store(mask); +<<<<<<< HEAD return std::all_of( mask, mask + VectorizedN::size(), [](T m) { return m != static_cast(0); }); +======= + return std::all_of(mask, mask + 
VectorizedN::size(), [](T m) { + return m != static_cast(0); + }); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } static inline bool is_masked(const VectorizedN& vec_mask, int i) { @@ -159,6 +180,7 @@ class VecMask { } static VecMask blendv( +<<<<<<< HEAD const VecMask& c, const VecMask& b, const VecMask& a) { @@ -166,6 +188,13 @@ class VecMask { VectorizedN(c), VectorizedN(b), VectorizedN(a)); +======= + const VecMask& c, + const VecMask& b, + const VecMask& a) { + VectorizedN result = VectorizedN::blendv( + VectorizedN(c), VectorizedN(b), VectorizedN(a)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } @@ -174,14 +203,24 @@ class VecMask { const VecMask& b, int64_t count = size()) { VectorizedN result = VectorizedN::set( +<<<<<<< HEAD VectorizedN(a), VectorizedN(b), count); +======= + VectorizedN(a), VectorizedN(b), count); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return result; } void store(bool* b, int count = size()) { +<<<<<<< HEAD constexpr int L = (VectorizedN::size() + Vectorized::size() - 1)/ Vectorized::size(); +======= + constexpr int L = + (VectorizedN::size() + Vectorized::size() - 1) / + Vectorized::size(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto res = this->to(); res.store(b, count); return; diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh index 5a127b4d7507..a9868dd1937b 100644 --- a/aten/src/ATen/cuda/Atomic.cuh +++ b/aten/src/ATen/cuda/Atomic.cuh @@ -330,9 +330,24 @@ inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpu inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); } inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); } inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); } +<<<<<<< HEAD inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } /* Special case fp32 atomic. */ +======= + +/* Note [HIP unsafeAtomicAdd] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Use unsafeAtomicAdd instead of atomicAdd for fp32 and fp64. + * On HIP, atomicAdd is always correct but is a slow CAS loop. + * unsafeAtomicAdd will use HW instructions and is much faster, + * but the caller must guarantee the pointer is GPU memory. + * If the pointer is system memory, the result is a silent no-op. + * This guarantee is upheld by all PyTorch uses of unsafeAtomicAdd. + * AMD HIP atomic header file is named amd_hip_atomic.h and is + * under the LLVM compiler directory. 
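 * (Illustrative sketch, not part of the upstream note: the guarded pattern
 *  described above, using only the overloads the surrounding code already
 *  relies on; the function name is made up.)
 *
 *    __device__ void add_to_device_buffer(float* device_ptr, float val) {
 *    #if defined(USE_ROCM)
 *      // device_ptr must point to GPU memory; on system memory this is a
 *      // silent no-op, as the note explains.
 *      (void)unsafeAtomicAdd(device_ptr, val);
 *    #else
 *      atomicAdd(device_ptr, val);
 *    #endif
 *    }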
+ */ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(USE_ROCM) inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { #if defined(__gfx908__) @@ -341,8 +356,15 @@ inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { (void)unsafeAtomicAdd(address, val); #endif } +<<<<<<< HEAD +#else +inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } +======= +inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { (void)unsafeAtomicAdd(address, val); } #else inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } +inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // Atomic multiplication implementation. @@ -399,7 +421,11 @@ template __host__ __device__ T safe_max(T a, T b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: +<<<<<<< HEAD // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 +======= + // https://github.com/ROCm/hip/issues/2209 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); #else T max = at::_isnan(b) ? b : std::max(a, b); @@ -459,7 +485,11 @@ template __host__ __device__ T safe_min(T a, T b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: +<<<<<<< HEAD // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 +======= + // https://github.com/ROCm/hip/issues/2209 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); #else T min = at::_isnan(b) ? b : std::min(a, b); diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index abf45deffeb9..e5b2de4e76bd 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -17,7 +17,10 @@ #include #ifdef USE_ROCM +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include // until hipblas has an API to accept flags, we must use rocblas here #include @@ -112,12 +115,24 @@ static cublasOperation_t _cublasOpFromChar(char op) { // NOLINTNEXTLINE(bugprone-switch-missing-default-case) switch (op) { case 'n': +<<<<<<< HEAD case 'N': return CUBLAS_OP_N; case 't': case 'T': return CUBLAS_OP_T; case 'c': +======= + [[fallthrough]]; + case 'N': + return CUBLAS_OP_N; + case 't': + [[fallthrough]]; + case 'T': + return CUBLAS_OP_T; + case 'c': + [[fallthrough]]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case 'C': return CUBLAS_OP_C; } @@ -186,6 +201,7 @@ uint32_t _getAlignment(uintptr_t address) { } #endif +<<<<<<< HEAD #ifdef USE_ROCM static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) { // 0 is default value, meaning full CUs i.e. 
no mask @@ -244,6 +260,8 @@ static void _syncCurrentWithCarveoutStream(hipStream_t stream, bool presync) { } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct CublasLtWorkspace { CublasLtWorkspace() { size = at::cuda::getCUDABlasLtWorkspaceSize(); @@ -252,7 +270,10 @@ struct CublasLtWorkspace { void * ptr; size_t size; }; +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // anonymous namespace namespace at::cuda::blas { @@ -280,7 +301,10 @@ namespace at::cuda::blas { CUDABLAS_NONNEGINT_CHECK(bgemm, num_batches); \ } while (0) +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { // Following the pattern of CuSparseDescriptor // Defined here for now because this is the only place cublas_lt interface is @@ -366,11 +390,30 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< } // namespace +<<<<<<< HEAD template inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { cudaDataType_t abcType = CUDA_R_32F; cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; cudaDataType_t scaleType = CUDA_R_32F; +======= +template +static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { +#if defined(USE_ROCM) && ROCM_VERSION == 60400 + // regression in ROCm 6.4, planned fixed in 6.4.1, hipblaslt TT fp32 calculation errors + // best to disallow hipblaslt for this specific case + if constexpr (std::is_same_v) { + if (_cublasOpFromChar(transa) == CUBLAS_OP_T && _cublasOpFromChar(transb) == CUBLAS_OP_T) { + return false; + } + } +#endif + cudaDataType_t abType = CUDA_R_32F; + cudaDataType_t cType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + CuBlasLtMatmulPreference preference; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM at::Half halpha; at::Half hbeta; @@ -378,7 +421,12 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { void * alpha_ptr = α void * beta_ptr = β if constexpr (std::is_same_v) { +<<<<<<< HEAD abcType = CUDA_R_64F; +======= + abType = CUDA_R_64F; + cType = CUDA_R_64F; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { @@ -386,26 +434,60 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v>) { +<<<<<<< HEAD abcType = CUDA_C_64F; computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_C_64F; } else if constexpr (std::is_same_v>) { abcType = CUDA_C_32F; +======= + abType = CUDA_C_64F; + cType = CUDA_C_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_C_64F; + } else if constexpr (std::is_same_v>) { + abType = CUDA_C_32F; + cType = CUDA_C_32F; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scaleType = CUDA_C_32F; } else if constexpr (std::is_same_v) { #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 7 && 
at::globalContext().allowFP16AccumulationCuBLAS()) { computeType = CUBLAS_COMPUTE_16F; +<<<<<<< HEAD +======= + scaleType = CUDA_R_16F; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) halpha = alpha; hbeta = beta; alpha_ptr = &halpha; beta_ptr = &hbeta; } #endif +<<<<<<< HEAD abcType = CUDA_R_16F; } else if constexpr (std::is_same_v) { abcType = CUDA_R_16BF; +======= + abType = CUDA_R_16F; + cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16F; +#ifndef USE_ROCM + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + } +#endif + } else if constexpr (std::is_same_v) { + abType = CUDA_R_16BF; + cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; +#ifndef USE_ROCM + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented"); } @@ -419,7 +501,10 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, opa); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, opb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -427,6 +512,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( @@ -437,6 +523,12 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, opa == CUBLAS_OP_T); CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, opb == CUBLAS_OP_T); CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc); +======= +#endif + CuBlasLtMatrixLayout Adesc(abType, m, k, lda, opa == CUBLAS_OP_T); + CuBlasLtMatrixLayout Bdesc(abType, k, n, ldb, opb == CUBLAS_OP_T); + CuBlasLtMatrixLayout Cdesc(cType, m, n, ldc); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (num_batches > 1) { int num_batches_as_int = static_cast(num_batches); @@ -448,8 +540,11 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { Cdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, stridec); } +<<<<<<< HEAD CuBlasLtMatmulPreference preference; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM uint32_t a_alignment = _getAlignment(reinterpret_cast(a)); uint32_t b_alignment = _getAlignment(reinterpret_cast(b)); @@ -463,6 +558,10 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { TORCH_CHECK(ltworkspace.ptr != nullptr, 
"OOM trying to allocate workspace for cublaslt"); preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); +<<<<<<< HEAD +======= + cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( @@ -477,10 +576,17 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { &heuristicResult, &returnedResult)); if (returnedResult == 0) { +<<<<<<< HEAD TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); } cublasStatus_t cublasStatus = cublasLtMatmul( +======= + cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED; + } + else { + cublasStatus = cublasLtMatmul( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ltHandle, computeDesc.descriptor(), alpha_ptr, @@ -496,6 +602,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { &heuristicResult.algo, ltworkspace.ptr, ltworkspace.size, +<<<<<<< HEAD stream); #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { @@ -505,6 +612,13 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", +======= + at::cuda::getCurrentCUDAStream()); + } + if (cublasStatus != CUBLAS_STATUS_SUCCESS) { + TORCH_WARN( + "bgemm_internal_cublaslt error: ", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::cuda::blas::_cublasGetErrorEnum(cublasStatus), " when calling cublasLtMatmul with transpose_mat1 ", (opa == CUBLAS_OP_T), @@ -522,6 +636,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { ldb, " ldc ", ldc, +<<<<<<< HEAD " abcType ", abcType, " computeType ", @@ -534,6 +649,26 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { template inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublas: not implemented"); +======= + " abType ", + abType, + " cType ", + cType, + " computeType ", + computeType, + " scaleType ", + scaleType, + ". 
Will attempt to recover by calling cublas instead."); + return false; + } + return true; +} + + +template +inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::bgemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -592,8 +727,13 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::com reinterpret_cast(c), ldc, stridec, num_batches)); } +<<<<<<< HEAD template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { +======= +template +inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -638,6 +778,7 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { handle, opa, opb, m, n, k, alpha_ptr, a, CUDA_R_16F, lda, stridea, b, CUDA_R_16F, ldb, strideb, beta_ptr, +<<<<<<< HEAD c, CUDA_R_16F, ldc, stridec, num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { @@ -648,13 +789,40 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { alpha, (a + i * stridea), lda, (b + i * strideb), ldb, beta, (c + i * stridec), ldc); +======= + c, std::is_same_v ? CUDA_R_32F : CUDA_R_16F, ldc, stridec, + num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + for (const auto i : c10::irange(num_batches)) { + if (std::is_same_v) { + float* c_ptr = (float*)(c + i * stridec); + at::cuda::blas::gemm( + transa, transb, + m, n, k, + alpha, (a + i * stridea), lda, + (b + i * strideb), ldb, beta, + c_ptr, ldc); + } else { + at::cuda::blas::gemm( + transa, transb, + m, n, k, + alpha, (a + i * stridea), lda, + (b + i * strideb), ldb, beta, + (c + i * stridec), ldc); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } #endif // USE_ROCM } +<<<<<<< HEAD template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { +======= +template +inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); BGEMM_CHECK_ARGVALUES(at::BFloat16); @@ -671,6 +839,7 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) auto compute_type = CUDA_R_32F; #endif TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle, +<<<<<<< HEAD opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, b, CUDA_R_16BF, (int)ldb, strideb, @@ -681,6 +850,40 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) } template <> +======= + opa, opb, (int)m, (int)n, (int)k, + (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, + b, CUDA_R_16BF, (int)ldb, strideb, + (void*)&fbeta, c, std::is_same_v ? 
CUDA_R_32F : CUDA_R_16BF, + (int)ldc, stridec, (int)num_batches, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +template <> +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { + bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGS(at::Half)); +} + +template <> +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { + bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGS(at::Half)); +} + +template <> +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGS(at::BFloat16)); +} + + +template <> +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { + bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGS(at::BFloat16)); +} + + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { @@ -688,7 +891,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) // hipblaslt does not support double gemm yet bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); #else +<<<<<<< HEAD bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(double)); +======= + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(double))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } else { @@ -700,7 +909,13 @@ template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +<<<<<<< HEAD bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); +======= + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); @@ -715,7 +930,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex gemm yet bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); #else +<<<<<<< HEAD bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex)); +======= + if (!bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex))) { + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } else { @@ -731,7 +952,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex gemm yet bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); #else +<<<<<<< HEAD bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex)); +======= + if (!bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex))) { + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif } else { @@ -743,7 +970,13 @@ template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +<<<<<<< HEAD bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); +======= + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } +>>>>>>> 5729657180 ([ROCm] 
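// Illustrative sketch (not part of the patch): the recovery pattern the
// incoming side applies above -- attempt the cublasLt path first and, when it
// reports failure by returning false, fall back to the plain cuBLAS path.
// Names below are placeholders, not real PyTorch or cuBLAS APIs.
#include <functional>

inline void run_with_fallback(const std::function<bool()>& try_lt_path,
                              const std::function<void()>& plain_path) {
  if (!try_lt_path()) {
    // The Lt path declined (unsupported dtype/shape combination, no heuristic
    // result, or a known-bad library version), so recover with the path that
    // is always supported.
    plain_path();
  }
}

inline void run_with_fallback_demo() {
  run_with_fallback(
      [] { return false; },              // Lt path declines
      [] { /* plain path runs here */ });
}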
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); @@ -754,9 +987,17 @@ template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +<<<<<<< HEAD bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } #ifdef USE_ROCM +======= + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); } @@ -766,9 +1007,56 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) } } +<<<<<<< HEAD template inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) { tunable::GemmStridedBatchedParams params; +======= +template<> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) +{ + if (at::globalContext().allowFP16AccumulationCuBLAS()) { + // Do not allow fp16 reductions with fp32 output + TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS"); + } + + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } + } +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +#endif + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } +} + +template<> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + if (!bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16))) { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); + } +#endif + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +} + +template +inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + tunable::GemmStridedBatchedParams params; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) params.transa = transa; params.transb = transb; params.m = m; @@ -791,6 +1079,7 @@ inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) { bool transb_ = ((transb != 'n') && (transb != 'N')); if (transa_ && transb_) { +<<<<<<< HEAD static tunable::GemmStridedBatchedTunableOp bgemm{}; bgemm(¶ms); } @@ -804,6 +1093,21 @@ inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) { } else if (!transa_ && !transb_) { static tunable::GemmStridedBatchedTunableOp bgemm{}; +======= + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (transa_ && !transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (!transa_ && transb_) { + static 
tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (!transa_ && !transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bgemm(¶ms); } else { @@ -877,6 +1181,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { } } +<<<<<<< HEAD template inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { // forward to bgemm implementation but set strides and batches to 0 @@ -889,6 +1194,39 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { } +======= +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { + #ifdef USE_ROCM + TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm"); + #endif + // TODO: Support tuning for Half inputs and FP32 output + bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); +} + + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { + #ifdef USE_ROCM + TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm"); + #else + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + + if (prop->major < 8) + TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher"); + #endif + // TODO: Support tuning for BFloat16 inputs and FP32 output + bgemm_internal(CUDABLAS_BGEMM_ARGS(at::BFloat16)); +} + + + +template +inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::gemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name()); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] @@ -945,8 +1283,13 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::compl reinterpret_cast(c), ldc)); } +<<<<<<< HEAD template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { +======= +template +inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -1025,7 +1368,11 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { ldb, beta_ptr, c, +<<<<<<< HEAD CUDA_R_16F, +======= + std::is_same_v ? CUDA_R_32F : CUDA_R_16F, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1047,14 +1394,23 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { ldb, &fbeta, c, +<<<<<<< HEAD CUDA_R_16F, +======= + std::is_same_v ? 
CUDA_R_32F : CUDA_R_16F, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldc)); } #endif } +<<<<<<< HEAD template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { +======= +template +inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -1091,7 +1447,11 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { ldb, &fbeta, c, +<<<<<<< HEAD CUDA_R_16BF, +======= + std::is_same_v ? CUDA_R_32F : CUDA_R_16BF, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1099,6 +1459,37 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } template <> +<<<<<<< HEAD +======= +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { + gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGS(at::Half)); +} + +template <> +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { + gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGS(at::Half)); +} + +template <> +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { + gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGS(at::BFloat16)); +} + +template <> +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { + gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGS(at::BFloat16)); +} + +template +inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + if (!bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0)) { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(Dtype)); + } +} + +template <> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { @@ -1109,7 +1500,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); #endif } +<<<<<<< HEAD #ifdef USE_ROCM +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); } @@ -1125,9 +1520,19 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } +<<<<<<< HEAD #ifdef USE_ROCM else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } else{ + 
at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif else { @@ -1173,7 +1578,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); } +<<<<<<< HEAD #ifdef USE_ROCM +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1189,7 +1598,11 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); } +<<<<<<< HEAD #ifdef USE_ROCM +======= +#if defined(USE_ROCM) && !defined(_MSC_VER) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); } @@ -1199,8 +1612,50 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) } } +<<<<<<< HEAD template inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES(DType)) { +======= +template<> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) +{ + if (at::globalContext().allowFP16AccumulationCuBLAS()) { + // Do not allow fp16 reductions with fp32 output + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS"); + } + + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +#endif + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } +} + +template<> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +#if defined(USE_ROCM) && !defined(_MSC_VER) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +#endif + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +} + +template +inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(DType, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) tunable::GemmParams params; params.transa = transa; params.transb = transb; @@ -1306,9 +1761,39 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } } +<<<<<<< HEAD template void gemm_and_bias( +======= +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + #endif + // TODO: Support Tuning for fp16-fp32 gemm + gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); +} + + +template <> +void 
gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); + #else + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + + if (prop->major < 8) + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher"); + #endif + // TODO: Support Tuning for bf16-fp32 gemm + gemm_internal(CUDABLAS_GEMM_ARGS(at::BFloat16)); +} + + +template +bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1320,6 +1805,7 @@ void gemm_and_bias( const Dtype* mat2_ptr, int64_t mat2_ld, const Dtype* bias, +<<<<<<< HEAD Dtype* result_ptr, int64_t result_ld, GEMMAndBiasActivationEpilogue activation) { @@ -1329,6 +1815,32 @@ void gemm_and_bias( cudaDataType_t abcType = CUDA_R_32F; cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; cudaDataType_t scaleType = CUDA_R_32F; +======= + C_Dtype* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation) { + + if (std::is_same_v && std::is_same_v) { + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); + #endif + } else if (std::is_same_v && std::is_same_v) { + #ifdef USE_ROCM + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + #endif + if (at::globalContext().allowFP16AccumulationCuBLAS()) + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS"); + } + + using opmath_t = at::opmath_type; + opmath_t beta_val = 0; // bias is added in epilogue + + cudaDataType_t abType = CUDA_R_32F; + cudaDataType_t cType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + CuBlasLtMatmulPreference preference; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void * alpha_ptr = &alpha_val; void * beta_ptr = &beta_val; #ifndef USE_ROCM @@ -1336,14 +1848,22 @@ void gemm_and_bias( at::Half hbeta_val; #endif if constexpr (std::is_same_v) { +<<<<<<< HEAD abcType = CUDA_R_64F; +======= + abType = CUDA_R_64F; + cType = CUDA_R_64F; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { if (at::globalContext().allowTF32CuBLAS()) { computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } +<<<<<<< HEAD abcType = CUDA_R_32F; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else if constexpr (std::is_same_v) { #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); @@ -1356,9 +1876,29 @@ void gemm_and_bias( beta_ptr = &hbeta_val; } #endif +<<<<<<< HEAD abcType = CUDA_R_16F; } else if constexpr (std::is_same_v) { abcType = CUDA_R_16BF; +======= + abType = CUDA_R_16F; + cType = (std::is_same_v) ? 
CUDA_R_32F : CUDA_R_16F; +#ifndef USE_ROCM + if (!at::globalContext().allowFP16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + } +#endif + } else if constexpr (std::is_same_v) { + abType = CUDA_R_16BF; + cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; +#ifndef USE_ROCM + if (!at::globalContext().allowBF16ReductionCuBLAS()) { + preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, + CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); @@ -1366,7 +1906,10 @@ void gemm_and_bias( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); cublasOperation_t transb = transpose_mat2 ? CUBLAS_OP_T : CUBLAS_OP_N; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -1374,12 +1917,15 @@ void gemm_and_bias( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; if (activation == GEMMAndBiasActivationEpilogue::RELU) { @@ -1395,11 +1941,18 @@ void gemm_and_bias( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); } +<<<<<<< HEAD CuBlasLtMatrixLayout Adesc(abcType, m, k, mat1_ld, transpose_mat1); CuBlasLtMatrixLayout Bdesc(abcType, k, n, mat2_ld, transpose_mat2); CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld); CuBlasLtMatmulPreference preference; +======= + CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1); + CuBlasLtMatrixLayout Bdesc(abType, k, n, mat2_ld, transpose_mat2); + CuBlasLtMatrixLayout Cdesc(cType, m, n, result_ld); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto ltworkspace = CublasLtWorkspace(); preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); @@ -1428,11 +1981,20 @@ void gemm_and_bias( 1, &heuristicResult, &returnedResult)); +<<<<<<< HEAD if (returnedResult == 0) { TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); } cublasStatus_t cublasStatus = cublasLtMatmul( +======= + cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS; + if (returnedResult == 0) { + cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED; + } + else { + cublasStatus = cublasLtMatmul( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ltHandle, computeDesc.descriptor(), alpha_ptr, @@ -1448,6 +2010,7 @@ void gemm_and_bias( &heuristicResult.algo, ltworkspace.ptr, ltworkspace.size, +<<<<<<< HEAD stream); #ifdef USE_ROCM if 
(at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { @@ -1457,6 +2020,13 @@ void gemm_and_bias( TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", +======= + at::cuda::getCurrentCUDAStream()); + } + if (cublasStatus != CUBLAS_STATUS_SUCCESS) { + TORCH_WARN( + "gemm_and_bias error: ", +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) at::cuda::blas::_cublasGetErrorEnum(cublasStatus), " when calling cublasLtMatmul with transpose_mat1 ", transpose_mat1, @@ -1474,6 +2044,7 @@ void gemm_and_bias( mat2_ld, " result_ld ", result_ld, +<<<<<<< HEAD " abcType ", abcType, " computeType ", @@ -1483,6 +2054,23 @@ void gemm_and_bias( } template void gemm_and_bias( +======= + " abType ", + abType, + " cType ", + cType, + " computeType ", + computeType, + " scaleType ", + scaleType, + ". Will attempt to recover by calling unfused cublas path."); + return false; + } + return true; +} + +template bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1498,7 +2086,11 @@ template void gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD template void gemm_and_bias( +======= +template bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1514,7 +2106,11 @@ template void gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD template void gemm_and_bias( +======= +template bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1530,7 +2126,27 @@ template void gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD template void gemm_and_bias( +======= +template bool gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::Half* mat1_ptr, + int64_t mat1_ld, + const at::Half* mat2_ptr, + int64_t mat2_ld, + const at::Half* bias, + float* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +template bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -1546,6 +2162,25 @@ template void gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +<<<<<<< HEAD +======= +template bool gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::BFloat16* mat1_ptr, + int64_t mat1_ld, + const at::BFloat16* mat2_ptr, + int64_t mat2_ld, + const at::BFloat16* bias, + float* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void scaled_gemm( char transa, char transb, @@ -1570,6 +2205,11 @@ void scaled_gemm( ScalarType result_dtype, bool use_fast_accum, bool use_rowwise) { +<<<<<<< HEAD +======= + // Note: see `cublasCommonArgs` for various non-intuitive manupulations + // 
of input arguments to this function. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if CUDA_VERSION >= 11080 || defined(USE_ROCM) const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; @@ -1608,7 +2248,10 @@ void scaled_gemm( if (result_scale_ptr != nullptr) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); } +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -1616,6 +2259,7 @@ void scaled_gemm( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( @@ -1627,6 +2271,13 @@ void scaled_gemm( const int8_t fastAccuMode = use_fast_accum ? 1 : 0; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode); #endif +======= +#endif // ifndef USE_ROCM +#ifndef USE_ROCM + const int8_t fastAccuMode = use_fast_accum ? 1 : 0; + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode); +#endif // ifndef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't'); CuBlasLtMatrixLayout Bdesc(ScalarTypeToCudaDataType(mat2_dtype), k, n, mat2_ld, transb == 't'); #ifdef USE_ROCM @@ -1634,7 +2285,11 @@ void scaled_gemm( CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld); #else CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(bias_dtype), m, n, result_ld); +<<<<<<< HEAD #endif +======= +#endif // ifdef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatrixLayout Ddesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld); if (bias_ptr) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr); @@ -1648,7 +2303,18 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0); #else TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 7.0(with gfx950) and above"); +<<<<<<< HEAD #endif // CUDA_VERSION >= 12080 +======= +#endif // if CUDA_VERSION >= 12080 + } else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) { +#if CUDA_VERSION >= 12080 + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3); +#else + TORCH_CHECK(false, "scaled_gemm with `torch.float8_e4m3fn` scales is only supported for CUDA 12.8 and above"); +#endif // if CUDA_VERSION >= 12080 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else if (mat1_scale_dtype == kFloat && mat2_scale_dtype == kFloat && use_rowwise) { #if CUDA_VERSION >= 12090 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC)) computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, 
CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); @@ -1660,6 +2326,10 @@ void scaled_gemm( #endif // if CUDA_VERSION >= 12090 } +<<<<<<< HEAD +======= + auto stream = c10::cuda::getCurrentCUDAStream(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CuBlasLtMatmulPreference preference; auto ltworkspace = CublasLtWorkspace(); preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size); @@ -1723,7 +2393,11 @@ void scaled_gemm( } } TORCH_CHECK(found, "could not find valid hipblaslt solution"); +<<<<<<< HEAD #endif +======= +#endif // ifndef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, @@ -1738,7 +2412,11 @@ void scaled_gemm( result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr #else nullptr, +<<<<<<< HEAD #endif +======= +#endif // ifdef USE_ROCM +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), @@ -1746,11 +2424,14 @@ void scaled_gemm( ltworkspace.ptr, ltworkspace.size, stream); +<<<<<<< HEAD #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", @@ -1776,7 +2457,11 @@ void scaled_gemm( " scaleType ", scaleType); return; +<<<<<<< HEAD #endif // CUDA_VERSION >= 11080 || defined(USE_ROCM) +======= +#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); } @@ -1804,7 +2489,10 @@ void int8_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, transa); cublasOperation_t transb = transpose_mat2 ? 
CUBLAS_OP_T : CUBLAS_OP_N; computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, transb); +<<<<<<< HEAD auto stream = at::cuda::getCurrentCUDAStream(); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { computeDesc.setAttribute( @@ -1812,12 +2500,15 @@ void int8_gemm( at::cuda::getCurrentDeviceProperties()->multiProcessorCount - at::globalContext()._SMCarveout_EXPERIMENTAL().value()); } +<<<<<<< HEAD #else if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { stream = _getCarveoutStream( at::globalContext()._SMCarveout_EXPERIMENTAL().value()); _syncCurrentWithCarveoutStream(stream, true); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1); @@ -1879,7 +2570,11 @@ void int8_gemm( #else 0, #endif +<<<<<<< HEAD stream); +======= + at::cuda::getCurrentCUDAStream()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, "CUDA error: ", @@ -1908,11 +2603,14 @@ void int8_gemm( computeType, " scaleType ", scaleType); +<<<<<<< HEAD #ifdef USE_ROCM if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) { _syncCurrentWithCarveoutStream(stream, false); } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -2224,6 +2922,11 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { reinterpret_cast(result))); } +<<<<<<< HEAD +======= +// HIP on Windows does not support +#if !(defined(USE_ROCM) && defined(_MSC_VER)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasSgetrsBatched( @@ -2422,5 +3125,9 @@ void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple devInfoArray, batchSize)); } +<<<<<<< HEAD +======= +#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 6075e7b9c9d8..eaa071bd5a4e 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -39,6 +39,7 @@ class PointerModeGuard { /* LEVEL 3 BLAS FUNCTIONS */ +<<<<<<< HEAD #define CUDABLAS_GEMM_ARGTYPES(Dtype) \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type beta,\ @@ -51,6 +52,28 @@ inline void gemm(CUDABLAS_GEMM_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm: not implemented"); } +======= +#define CUDABLAS_GEMM_ARGTYPES(Dtype) CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) + +#define CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype) \ + char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ + const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type beta,\ + C_Dtype *c, int64_t ldc + +#define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, 
ldc + +#define CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT \ + ((std::is_same::value || std::is_same::value) && std::is_same::value) + +template ::type* = nullptr> +inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm: not implemented"); +} + +template ::type* = nullptr> +void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void gemm(CUDABLAS_GEMM_ARGTYPES(double)); template <> @@ -63,9 +86,19 @@ template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); +<<<<<<< HEAD template inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES(Dtype)) { +======= +template<> +void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)); +template<> +void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)); + +template +inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm_internal: not implemented"); } @@ -81,6 +114,13 @@ template <> void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); +<<<<<<< HEAD +======= +template<> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)); +template<> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) enum GEMMAndBiasActivationEpilogue { None, @@ -90,8 +130,13 @@ enum GEMMAndBiasActivationEpilogue { // NOTE: GELU activation is not supported prior to CUDA 11.4 and will // do nothing if passed in that case. 
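A minimal sketch of how the mixed-precision overloads declared above are expected to be called. The template parameter lists were stripped in the text above and are assumed to have the form template <typename Dtype, typename C_Dtype = Dtype, ...>, so C_Dtype is deduced from the output pointer; the wrapper function and buffer layout below are hypothetical.

#include <cstdint>
#include <ATen/cuda/CUDABlas.h>
#include <c10/util/Half.h>

// Hypothetical wrapper: fp16 inputs accumulated into an fp32 output via the
// (at::Half, float) gemm specialization declared above. Column-major layout:
// a is m x k (lda = m), b is k x n (ldb = k), c is m x n (ldc = m).
void gemm_half_in_float_out(int64_t m, int64_t n, int64_t k,
                            const at::Half* a, const at::Half* b, float* c) {
  // Dtype = at::Half and C_Dtype = float are deduced from the pointer types.
  at::cuda::blas::gemm(
      /*transa=*/'n', /*transb=*/'n', m, n, k,
      /*alpha=*/1.0f,
      a, /*lda=*/m,
      b, /*ldb=*/k,
      /*beta=*/0.0f,
      c, /*ldc=*/m);
}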
+<<<<<<< HEAD template void gemm_and_bias( +======= +template +bool gemm_and_bias( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool transpose_mat1, bool transpose_mat2, int64_t m, @@ -103,7 +148,11 @@ void gemm_and_bias( const Dtype* mat2_ptr, int64_t mat2_ld, const Dtype* bias, +<<<<<<< HEAD Dtype* result_ptr, +======= + C_Dtype* result_ptr, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t result_ld, GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None); @@ -145,20 +194,41 @@ void scaled_gemm( bool use_fast_accum, bool use_rowwise); +<<<<<<< HEAD #define CUDABLAS_BGEMM_ARGTYPES(Dtype) \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ const Dtype *a, int64_t lda, int64_t stridea, \ const Dtype *b, int64_t ldb, int64_t strideb, \ at::opmath_type beta, Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches +======= +#define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) + +#define CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype) \ + char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ + const Dtype *a, int64_t lda, int64_t stridea, \ + const Dtype *b, int64_t ldb, int64_t strideb, \ + at::opmath_type beta, C_Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define CUDABLAS_BGEMM_ARGS(Dtype) \ transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches +<<<<<<< HEAD template inline void bgemm(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm: not implemented"); } +======= +template ::type* = nullptr> +inline void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm: not implemented"); +} + +template ::type* = nullptr> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)); template <> @@ -171,9 +241,19 @@ template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)); template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +<<<<<<< HEAD template inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +======= +template<> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)); +template<> +void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)); + +template +inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm_internal: not implemented"); } @@ -189,6 +269,13 @@ template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)); template <> void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +<<<<<<< HEAD +======= +template<> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)); +template<> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for 
mixed dtypes with float/bfloat16/half (#2791)) #define CUDABLAS_TRSM_ARGTYPES(Dtype) \ cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \ @@ -292,6 +379,24 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)); int n, int nrhs, Dtype** dA_array, int lda, int* ipiv_array, \ Dtype** dB_array, int ldb, int* info_array, int batchsize +<<<<<<< HEAD +======= +#define CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype) \ + cublasHandle_t handle, int m, int n, Dtype **A_array, int lda, \ + Dtype **tau_array, int *info, int batchsize + +#define CUDABLAS_GETRF_ARGTYPES(Dtype) \ + int n, Dtype** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize + +#define CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype) \ + cublasHandle_t handle, cublasOperation_t trans, \ + int m, int n, int nrhs, Dtype** dA_array, int ldda, \ + Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize + +// HIP on Windows does not support getrs, geqrf, getrf, gels +#if !(defined(USE_ROCM) && defined(_MSC_VER)) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented"); @@ -305,10 +410,13 @@ TORCH_CUDA_CU_API void getrsBatched>(CUDABLAS_GETRS_ARGTYPES template<> TORCH_CUDA_CU_API void getrsBatched>(CUDABLAS_GETRS_ARGTYPES(c10::complex)); +<<<<<<< HEAD #define CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype) \ cublasHandle_t handle, int m, int n, Dtype **A_array, int lda, \ Dtype **tau_array, int *info, int batchsize +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype), "at::cuda::blas::geqrfBatched: not implemented"); @@ -324,12 +432,18 @@ template <> TORCH_CUDA_CU_API void geqrfBatched>( CUDABLAS_GEQRF_BATCHED_ARGTYPES(c10::complex)); +<<<<<<< HEAD #define CUDABLAS_GETRF_ARGTYPES(Dtype) \ int n, Dtype** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize template void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not implemented"); +======= +template +void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { + static_assert(false&&sizeof(Dtype), "at::cuda::blas::getrfBatched: not implemented"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template<> TORCH_CUDA_CU_API void getrfBatched(CUDABLAS_GETRF_ARGTYPES(float)); @@ -340,6 +454,7 @@ TORCH_CUDA_CU_API void getrfBatched>(CUDABLAS_GETRF_ARGTYPE template<> TORCH_CUDA_CU_API void getrfBatched>(CUDABLAS_GETRF_ARGTYPES(c10::complex)); +<<<<<<< HEAD #define CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype) \ cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, Dtype** dA_array, int ldda, Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize @@ -348,6 +463,12 @@ void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::gelsBatched: not implemented"); } +======= +template +void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { + static_assert(false&&sizeof(Dtype), "at::cuda::blas::gelsBatched: not implemented"); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template<> 
TORCH_CUDA_CU_API void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(double)); template<> @@ -357,4 +478,31 @@ TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_A template<> TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex)); +<<<<<<< HEAD +======= +#else // !(defined(USE_ROCM) && defined(_MSC_VER)) + +template +void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows"); +} + +template +void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows"); +} + +template +void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows"); +} + +template +void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { + TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows"); +} + +#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDAConfig.h.in b/aten/src/ATen/cuda/CUDAConfig.h.in index 7c7f2cc7470a..1cda19296d9a 100644 --- a/aten/src/ATen/cuda/CUDAConfig.h.in +++ b/aten/src/ATen/cuda/CUDAConfig.h.in @@ -8,6 +8,10 @@ // only be included from C++ files. #define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@ #define AT_CUSPARSELT_ENABLED() @AT_CUSPARSELT_ENABLED@ +<<<<<<< HEAD +======= +#define AT_HIPSPARSELT_ENABLED() @AT_HIPSPARSELT_ENABLED@ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define AT_ROCM_ENABLED() @AT_ROCM_ENABLED@ #define AT_MAGMA_ENABLED() @AT_MAGMA_ENABLED@ diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h index b3ac2b39fcfb..4ebe9aef3f93 100644 --- a/aten/src/ATen/cuda/CUDADataType.h +++ b/aten/src/ATen/cuda/CUDADataType.h @@ -78,7 +78,11 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) return CUDA_R_64I; case c10::ScalarType::BFloat16: return CUDA_R_16BF; +<<<<<<< HEAD #if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || (defined(USE_ROCM) && ROCM_VERSION >= 60300) +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60300 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case c10::ScalarType::Float8_e4m3fn: return CUDA_R_8F_E4M3; case c10::ScalarType::Float8_e5m2: @@ -90,6 +94,13 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) case c10::ScalarType::Float8_e5m2fnuz: return HIP_R_8F_E5M2_FNUZ; #endif +<<<<<<< HEAD +======= +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000) + case c10::ScalarType::Float4_e2m1fn_x2: + return CUDA_R_4F_E2M1; +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) default: TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.") } diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 94ce34645b02..b231589676fe 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -13,6 +13,20 @@ #include #include +<<<<<<< HEAD +======= +/* +* `cudaEventExternal` is a torch-specific flag that is used to +* indicate 
that the CUDAEvent will be used only for synchronization +* with work outside of the cuda graph, rather than creation of +* cross-stream dependencies within a cuda graph. Resources: +* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-c-programming-guide/index.html#cross-stream-dependencies-and-events +* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3457b81d1d32c6a00f6132fbc2693d47 +* https://docs.nvidia.com/cuda/archive/12.9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g0c23426b7252eaa9cef695859991304e +*/ +#define cudaEventExternal 0x08 + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::cuda { /* @@ -118,7 +132,18 @@ struct TORCH_CUDA_CPP_API CUDAEvent { TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_, " does not match recording stream's device ", stream.device_index(), "."); CUDAGuard guard(device_index_); +<<<<<<< HEAD AT_CUDA_CHECK(cudaEventRecord(event_, stream)); +======= + +#ifndef USE_ROCM + // it is an error to use cudaEventRecordExternal when not doing stream capture + unsigned int flags = (c10::cuda::currentStreamCaptureStatusMayInitCtx() != c10::cuda::CaptureStatus::None && external_) ? cudaEventRecordExternal : cudaEventRecordDefault; + AT_CUDA_CHECK(cudaEventRecordWithFlags(event_, stream, flags)); +#else + AT_CUDA_CHECK(cudaEventRecord(event_, stream)); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { (*interp)->trace_gpu_event_record(at::kCUDA, @@ -134,7 +159,17 @@ struct TORCH_CUDA_CPP_API CUDAEvent { void block(const CUDAStream& stream) { if (is_created_) { CUDAGuard guard(stream.device_index()); +<<<<<<< HEAD AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0)); +======= +#ifndef USE_ROCM + // it is an error to use cudaEventWaitExternal when not doing stream capture + unsigned int flags = (c10::cuda::currentStreamCaptureStatusMayInitCtx() != c10::cuda::CaptureStatus::None && external_) ? 
cudaEventWaitExternal : cudaEventWaitDefault; + AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, flags)); +#else + AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_)); +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { (*interp)->trace_gpu_event_wait(at::kCUDA, @@ -147,8 +182,21 @@ struct TORCH_CUDA_CPP_API CUDAEvent { // Note: cudaEventElapsedTime can be safely called from any device float elapsed_time(const CUDAEvent& other) const { +<<<<<<< HEAD TORCH_CHECK(is_created_ && other.isCreated(), "Both events must be recorded before calculating elapsed time."); +======= + TORCH_CHECK_VALUE( + !(flags_ & cudaEventDisableTiming) && !(other.flags_ & cudaEventDisableTiming), + "Both events must be created with argument 'enable_timing=True'."); + TORCH_CHECK_VALUE( + is_created_ && other.isCreated(), + "Both events must be recorded before calculating elapsed time."); + TORCH_CHECK( + query() && other.query(), + "Both events must be completed before calculating elapsed time."); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) float time_ms = 0; // We do not strictly have to set the device index to the same as our event, // but if we don't and the current device is not initialized, it will @@ -185,10 +233,22 @@ struct TORCH_CUDA_CPP_API CUDAEvent { unsigned int flags_ = cudaEventDisableTiming; bool is_created_ = false; bool was_recorded_ = false; +<<<<<<< HEAD +======= + bool external_ = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DeviceIndex device_index_ = -1; cudaEvent_t event_{}; void createEvent(DeviceIndex device_index) { +<<<<<<< HEAD +======= + external_ = (flags_ & cudaEventExternal) != 0; +#ifdef USE_ROCM + TORCH_CHECK(!external_, "External events are disallowed in rocm"); +#endif + flags_ &= ~cudaEventExternal; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) device_index_ = device_index; CUDAGuard guard(device_index_); AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_)); diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 3f2916862cac..5d56bfa5b352 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -38,9 +38,16 @@ MempoolId_t graph_pool_handle() { * describes memory management for captures. */ +<<<<<<< HEAD CUDAGraph::CUDAGraph() // CUDAStreams may not be default-constructed. : capture_stream_(at::cuda::getCurrentCUDAStream()) { +======= +CUDAGraph::CUDAGraph(bool keep_graph) + // CUDAStreams may not be default-constructed. 
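A hedged usage sketch for the torch-specific cudaEventExternal flag introduced above: the flag is stored in external_ and only lowers to cudaEventRecordExternal / cudaEventWaitExternal while a stream capture is in progress (CUDA builds only; the patch rejects it under ROCm). The capture plumbing below is illustrative, not part of the patch.

#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/CUDAGraph.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>

// Hypothetical: record an external event during capture so that work outside
// the graph can synchronize on it without adding an in-graph dependency.
void capture_with_external_event(at::cuda::CUDAGraph& graph,
                                 c10::cuda::CUDAStream capture_stream) {
  at::cuda::CUDAEvent ev(cudaEventDisableTiming | cudaEventExternal);
  c10::cuda::CUDAStreamGuard guard(capture_stream);  // capture on a side stream
  graph.capture_begin();
  // ... enqueue the kernels to be captured on capture_stream ...
  ev.record(capture_stream);  // becomes cudaEventRecordWithFlags(..., cudaEventRecordExternal)
  graph.capture_end();
}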
+ : capture_stream_(at::cuda::getCurrentCUDAStream()), + keep_graph_(keep_graph) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void CUDAGraph::register_generator_state( @@ -126,8 +133,42 @@ void CUDAGraph::capture_end() { c10::cuda::CUDACachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); TORCH_CHECK(graph_ != nullptr, "Invalid capture."); +<<<<<<< HEAD + has_graph_ = true; + +======= + + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + wholegraph_increments = generator_state->capture_epilogue(); + } + + size_t numCUDAGraphNodes = 0; + AT_CUDA_CHECK(cudaGraphGetNodes(graph_, nullptr, &numCUDAGraphNodes)); + if (numCUDAGraphNodes == 0) { + TORCH_WARN("The CUDA Graph is empty. This usually means that the graph was ", + "attempted to be captured on wrong device or stream."); + } + + capture_ended_ = true; has_graph_ = true; + if (!keep_graph_) { + instantiate(); + if (!_cuda_graphs_debug) { + AT_CUDA_CHECK(cudaGraphDestroy(graph_)); + } + has_graph_ = false; + } +} +void CUDAGraph::instantiate() { + TORCH_CHECK(capture_ended_, "capture_end() must have been called before calling instantiate"); + + if (has_graph_exec_) { + TORCH_CHECK(keep_graph_, "instantiate() is intended to be called by the user only when keep_graph=true"); + AT_CUDA_CHECK(cudaGraphExecDestroy(graph_exec_)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // In typical graph usage some tensors (e.g. the tensors used for graph IO) are not freed // between replays. // If Pytorch compiles and runs with a CUDA 11.4+ toolkit, there's a chance the allocator backend @@ -139,7 +180,11 @@ void CUDAGraph::capture_end() { // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597 // cudaGraphInstantiateWithFlags // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233 +<<<<<<< HEAD #if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)) +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60200 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int version = 0; AT_CUDA_CHECK(cudaDriverGetVersion(&version)); if (version < 11040) { @@ -154,13 +199,18 @@ void CUDAGraph::capture_end() { #endif //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory. //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch. 
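A minimal capture/replay sketch for the new keep_graph option defined above: with the default keep_graph=false, capture_end() still instantiates immediately and destroys graph_, while keep_graph=true retains graph_ and defers cudaGraphInstantiate to an explicit instantiate() or to the first replay() (see the replay() change below). The launched work is elided, and the current stream is assumed to be a capturable side stream.

#include <ATen/cuda/CUDAGraph.h>

// Hypothetical round trip with keep_graph=true.
void capture_and_replay() {
  at::cuda::CUDAGraph graph(/*keep_graph=*/true);
  graph.capture_begin();
  // ... launch the work to be captured on the current (capture) stream ...
  graph.capture_end();   // graph_ is retained; no cudaGraphInstantiate yet
  graph.instantiate();   // optional here; replay() would instantiate lazily
  graph.replay();
}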
+<<<<<<< HEAD #if ((defined(CUDA_VERSION) && CUDA_VERSION >= 11040) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)) +======= +#if !defined(USE_ROCM) || ROCM_VERSION >= 60200 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_, graph_, cudaGraphInstantiateFlagAutoFreeOnLaunch)); } #endif +<<<<<<< HEAD has_graph_exec_ = true; @@ -191,6 +241,20 @@ void CUDAGraph::replay() { TORCH_CHECK(has_graph_exec_, "Called CUDAGraph::replay without a preceding successful capture."); +======= + has_graph_exec_ = true; +} + +void CUDAGraph::replay() { + TORCH_CHECK(capture_ended_, + "Called CUDAGraph::replay without a preceding successful capture."); + + if (!has_graph_exec_) { + TORCH_INTERNAL_ASSERT(keep_graph_); + instantiate(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; for (auto& [generator_state, wholegraph_increments] : @@ -216,14 +280,26 @@ void CUDAGraph::enable_debug_mode() { } void CUDAGraph::debug_dump(const std::string& debug_path) { +<<<<<<< HEAD #if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)|| defined(USE_ROCM) if (_cuda_graphs_debug) { +======= +#if defined(CUDA_VERSION) || defined(USE_ROCM) + if (_cuda_graphs_debug || keep_graph_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_WARN("DEBUG: calling debug_dump()"); if (has_graph_) { TORCH_WARN("DEBUG: calling cudaGraphDebugDotPrint() with ", debug_path); C10_CUDA_CHECK_WARN(cudaGraphDebugDotPrint(graph_, debug_path.c_str(), cudaGraphDebugDotFlagsVerbose)); // most verbose output +<<<<<<< HEAD AT_CUDA_CHECK(cudaGraphDestroy(graph_)); has_graph_ = false; +======= + if (!keep_graph_) { + AT_CUDA_CHECK(cudaGraphDestroy(graph_)); + has_graph_ = false; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else { TORCH_WARN("CUDA Graphs debug not enabled, set with [graph].enable_debug_mode()"); @@ -233,6 +309,15 @@ void CUDAGraph::debug_dump(const std::string& debug_path) { #endif } +<<<<<<< HEAD +======= +cudaGraph_t CUDAGraph::raw_cuda_graph() { + TORCH_CHECK(keep_graph_, "You cannot access the raw cudaGraph_t instance unless CUDAGraph was initialized with keep_graph=true"); + TORCH_CHECK(has_graph_, "You cannot access the raw cudaGraph_t instance until capture_end() has been called"); + return graph_; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void CUDAGraph::reset() { // I'd prefer these checks throw exceptions, not print warnings, // but the destructor calls reset(), and at least one CI build @@ -253,9 +338,16 @@ void CUDAGraph::reset() { // and the allocator could end up in all kinds of weird states depending where failure occurred. // If the user catches the failure exception in a script, or is running in REPL or (god forbid) // a Jupyter notebook, I don't see an easy way for reset() to gracefully fix all such possible error states. +<<<<<<< HEAD if (has_graph_ || has_graph_exec_) { // notifyCaptureDestroy may throw. How should we handle this? c10::cuda::CUDACachingAllocator::releasePool(capture_dev_, mempool_id_); +======= + if (capture_ended_) { + // notifyCaptureDestroy may throw. 
How should we handle this? + c10::cuda::CUDACachingAllocator::releasePool(capture_dev_, mempool_id_); + capture_ended_ = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (has_graph_) { C10_CUDA_CHECK_WARN(cudaGraphDestroy(graph_)); @@ -269,7 +361,11 @@ void CUDAGraph::reset() { // Returns an id another graph's capture_begin can use to share the same memory pool as this graph. MempoolId_t CUDAGraph::pool() { +<<<<<<< HEAD TORCH_CHECK(has_graph_exec_, +======= +TORCH_CHECK(capture_ended_, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "Called CUDAGraph::pool() without a preceding successful capture."); return mempool_id_; } diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index 76a090579d1d..b492b3152797 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -19,7 +19,11 @@ namespace cuda { TORCH_CUDA_CPP_API MempoolId_t graph_pool_handle(); struct TORCH_CUDA_CPP_API CUDAGraph { +<<<<<<< HEAD CUDAGraph(); +======= + CUDAGraph(bool keep_graph=false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ~CUDAGraph(); // See Note [Explicit Registration of Generators to the CUDA Graph] @@ -29,21 +33,39 @@ struct TORCH_CUDA_CPP_API CUDAGraph { MempoolId_t pool = {0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); void capture_end(); +<<<<<<< HEAD +======= + void instantiate(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void replay(); void reset(); MempoolId_t pool(); void enable_debug_mode(); void debug_dump(const std::string& debug_path); +<<<<<<< HEAD +======= + cudaGraph_t raw_cuda_graph(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) protected: cudaGraph_t graph_ = nullptr; cudaGraphExec_t graph_exec_ = nullptr; // internal states so reset() can do its best cleaning up +<<<<<<< HEAD // Set to true in capture_end if cudaStreamEndCapture succeeded // Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate // to create graph_exec_, then graph_ is deleted bool has_graph_ = false; +======= + + // Set to true in capture_end if cudaStreamEndCapture succeeded + // Set back to false after instantiate() unless keep_graph=True or + // enable_debug_mode() was called on any CUDAGraph instance. 
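The raw_cuda_graph() accessor declared above is only valid when the graph was constructed with keep_graph=true and only after capture_end(); a hedged sketch of inspecting the captured-but-not-yet-instantiated graph follows (cudaGraphGetNodes is standard CUDA runtime API, not part of this patch).

#include <cuda_runtime_api.h>
#include <ATen/cuda/CUDAGraph.h>
#include <ATen/cuda/Exceptions.h>

// Hypothetical: count the nodes of a captured graph before instantiation.
size_t captured_node_count(at::cuda::CUDAGraph& graph) {
  cudaGraph_t raw = graph.raw_cuda_graph();  // requires keep_graph=true
  size_t num_nodes = 0;
  AT_CUDA_CHECK(cudaGraphGetNodes(raw, /*nodes=*/nullptr, &num_nodes));
  return num_nodes;
}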
+ bool has_graph_ = false; + // Set to true in capture_end if cudaStreamEndCapture succeeded + bool capture_ended_ = false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Set to true in capture_end if cudaGraphInstantiate succeeded bool has_graph_exec_ = false; @@ -80,6 +102,11 @@ struct TORCH_CUDA_CPP_API CUDAGraph { // init capture_dev_ as UNDEFINED_DEVICE to check that it stores the real device id in the destructor static constexpr c10::DeviceIndex UNDEFINED_DEVICE = -1; c10::DeviceIndex capture_dev_{UNDEFINED_DEVICE}; +<<<<<<< HEAD +======= + + bool keep_graph_; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index 84711be2ddf3..0df75f3b39c7 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -75,12 +75,16 @@ cusparseDnMatDescr_t createRawDnMatDescriptor(const Tensor& input, int64_t batch auto leading_dimension = is_row_major ? input_strides[ndim - 2] : input_strides[ndim - 1]; +<<<<<<< HEAD #if !defined(USE_ROCM) auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; #else TORCH_INTERNAL_ASSERT(is_column_major, "Expected column major input."); auto order = CUSPARSE_ORDER_COL; #endif +======= + auto order = is_row_major ? CUSPARSE_ORDER_ROW : CUSPARSE_ORDER_COL; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto batch_stride = ndim > 2 && batch_offset >= 0 ? input_strides[ndim - 3] : 0; // NOLINTNEXTLINE(*const-cast) diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 8a039ea3bff9..47e948a9acd4 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -9,6 +9,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::cuda { namespace { @@ -71,6 +75,11 @@ using Block = HostBlock; struct CUDACachingHostAllocatorImpl : public CachingHostAllocatorImpl { private: +<<<<<<< HEAD +======= + std::unordered_map use_host_register; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void allocate_host_memory(size_t size, void** ptr) override { // Pinned memory pointers allocated by any device can be directly used by // any other device, regardless of the current device at the time of @@ -88,41 +97,82 @@ struct CUDACachingHostAllocatorImpl at::Device(at::DeviceType::CUDA, *primary_ctx_device_index)); } +<<<<<<< HEAD auto start = std::chrono::system_clock::now(); if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: pinned_use_cuda_host_register()) { +======= + auto start = std::chrono::steady_clock::now(); + bool use_register = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_cuda_host_register(); + if (use_register) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) allocWithCudaHostRegister(ptr, size); } else { // Use cudaHostAlloc for allocating pinned memory (global lock in driver) C10_CUDA_CHECK(cudaHostAlloc(ptr, size, 
cudaHostAllocDefault)); } +<<<<<<< HEAD auto end = std::chrono::system_clock::now(); +======= + + auto end = std::chrono::steady_clock::now(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto duration = std::chrono::duration_cast(end - start); // Update the statistics on the time spent on cudaHostAlloc/hostRegister { std::lock_guard g(stats_.timing_mutex_); +<<<<<<< HEAD +======= + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(use_host_register.count(*ptr) == 0); + use_host_register[*ptr] = use_register; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.host_alloc_time.increase(duration.count()); } } void free_block(Block* block) override { +<<<<<<< HEAD auto start = std::chrono::system_clock::now(); if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: pinned_use_cuda_host_register()) { void* ptr = block->ptr_; +======= + auto start = std::chrono::steady_clock::now(); + // Users may change the allocator config at will. torch unit tests do this. + // However, allocations using cudaHostRegister should use corresonding + // cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost. + void* ptr = block->ptr_; + bool use_register = false; + { + std::lock_guard g(stats_.timing_mutex_); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(use_host_register.count(ptr) == 1); + use_register = use_host_register[ptr]; + } + if (use_register) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_CUDA_CHECK(cudaHostUnregister(ptr)); // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) std::free(ptr); } else { +<<<<<<< HEAD AT_CUDA_CHECK(cudaFreeHost(block->ptr_)); } auto end = std::chrono::system_clock::now(); +======= + AT_CUDA_CHECK(cudaFreeHost(ptr)); + } + auto end = std::chrono::steady_clock::now(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto duration = std::chrono::duration_cast(end - start); // Update the statistics on the time spent on cudaFreeHost/hostUnregister { std::lock_guard g(stats_.timing_mutex_); +<<<<<<< HEAD +======= + use_host_register.erase(ptr); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) stats_.host_free_time.increase(duration.count()); } } @@ -185,6 +235,7 @@ struct CUDACachingHostAllocatorImpl } } +<<<<<<< HEAD void registerPages(const void* ptr, size_t size) { AT_CUDA_CHECK( cudaHostRegister((void*)ptr, (size_t)size, cudaHostRegisterDefault)); @@ -200,6 +251,8 @@ struct CUDACachingHostAllocatorImpl ""); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void allocWithCudaHostRegister(void** ptr, size_t roundSize) { // Here we do regular allocation, pre-fault/map the pages, and then do // cudaHostRegister with GPU mapping flags to lock the pages, so we @@ -249,6 +302,7 @@ struct CUDACachingHostAllocatorImpl } // Register the mapped pages using cudaHostRegister +<<<<<<< HEAD registerPages(*ptr, roundSize); } }; @@ -307,4 +361,20 @@ void CachingHostAllocator_resetPeakStats() { return getCUDACachingHostAllocator().resetPeakStats(); } +======= + AT_CUDA_CHECK( + cudaHostRegister(*ptr, roundSize, cudaHostRegisterDefault)); + } +}; + +DECLARE_HOST_ALLOCATOR( + CUDACachingHostAllocator, + 
CUDACachingHostAllocatorImpl, + raw_local_deleter, + caching_host_allocator); + +REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator) + +} // anonymous namespace +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::cuda diff --git a/aten/src/ATen/cuda/CachingHostAllocator.h b/aten/src/ATen/cuda/CachingHostAllocator.h index 6c33dfaeb534..826c090b04ed 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.h +++ b/aten/src/ATen/cuda/CachingHostAllocator.h @@ -3,6 +3,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::cuda { @@ -18,6 +22,7 @@ namespace at::cuda { // call between host and device, and passed the corresponding context from the // allocation. This is currently invoked by at::native::copy_kernel_cuda. // +<<<<<<< HEAD TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator(); // Records an event in the specified stream. The allocation corresponding to the @@ -38,5 +43,54 @@ TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats(); TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats(); TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats(); +======= +C10_DEPRECATED_MESSAGE( + "at::cuda::getCachingHostAllocator() is deprecated. Please use at::getHostAllocator(at::kCUDA) instead.") +inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() { + return at::getHostAllocator(at::kCUDA); +} + +// Records an event in the specified stream. The allocation corresponding to the +// input `ptr`/`ctx` will not be re-used until the event has occurred. +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.") +inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent( + void* ptr, + void* ctx, + c10::cuda::CUDAStream stream) { + return getHostAllocator(at::kCUDA)->record_event(ptr, ctx, stream.unwrap()); +} + +// Releases cached pinned memory allocations via cudaHostFree +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_emptyCache() is deprecated. Please use at::getHostAllocator(at::kCUDA)->empty_cache() instead.") +inline TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache() { + getHostAllocator(at::kCUDA)->empty_cache(); +} + +C10_DEPRECATED_MESSAGE( + "at::cuda::HostAlloc(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->allocate(...) instead.") +inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) { + return getHostAllocator(at::kCUDA)->allocate(size); +} + +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_getStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->get_stats() instead.") +inline TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats() { + return getHostAllocator(at::kCUDA)->get_stats(); +} + +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_resetAccumulatedStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->reset_accumulated_stats() instead.") +inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats() { + getHostAllocator(at::kCUDA)->reset_accumulated_stats(); +} + +C10_DEPRECATED_MESSAGE( + "at::cuda::CachingHostAllocator_resetPeakStats() is deprecated. 
+inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats() {
+  getHostAllocator(at::kCUDA)->reset_peak_stats();
+}
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))

 } // namespace at::cuda
diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp
index 2892a286ed62..9053902b3ba0 100644
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -123,6 +123,7 @@ void clearCublasWorkspaces() {
 }

 size_t parseChosenWorkspaceSize() {
+<<<<<<< HEAD
   const char * val = getenv("CUBLAS_WORKSPACE_CONFIG");
 #ifdef USE_ROCM
   if (!val) {
@@ -137,6 +138,20 @@ size_t parseChosenWorkspaceSize() {
     std::string device_arch = properties->gcnArchName;
     const bool gfx94 = device_arch.find("gfx94") != std::string::npos;
     const size_t default_size = gfx94 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
+=======
+  auto val = c10::utils::get_env("CUBLAS_WORKSPACE_CONFIG");
+#ifdef USE_ROCM
+  if (!val) {
+    val = c10::utils::get_env("HIPBLAS_WORKSPACE_CONFIG");
+  }
+  if (!val) {
+    // for extra convenience
+    val = c10::utils::get_env("ROCBLAS_WORKSPACE_CONFIG");
+  }
+  /* 32MiB default, 128MiB for gfx94x/gfx95x */
+  const bool gfx94_95 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95"});
+  const size_t default_size = gfx94_95 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 #else
   /* :4096:2:16:8 default, 32MiB for Hopper */
   cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
@@ -146,7 +161,11 @@ size_t parseChosenWorkspaceSize() {

   if (val) {
     size_t total_size = 0;
+<<<<<<< HEAD
     const std::string config(val);
+=======
+    const std::string& config(val.value());
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
     std::regex exp(":([0-9]+):([0-9]+)");
     std::sregex_iterator next(config.begin(), config.end(), exp);
     std::sregex_iterator end;
diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h
index 7a24151df205..029cd7338dc0 100644
--- a/aten/src/ATen/cuda/Exceptions.h
+++ b/aten/src/ATen/cuda/Exceptions.h
@@ -117,15 +117,22 @@ constexpr const char* _cusolver_backend_suggestion = \
     "linear algebra operators with other supported backends. " \
     "See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";

+<<<<<<< HEAD
 // When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
+=======
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791))
 // When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
#define TORCH_CUSOLVER_CHECK(EXPR) \ do { \ cusolverStatus_t __err = EXPR; \ +<<<<<<< HEAD if ((CUDA_VERSION < 11500 && \ __err == CUSOLVER_STATUS_EXECUTION_FAILED) || \ (CUDA_VERSION >= 11500 && \ __err == CUSOLVER_STATUS_INVALID_VALUE)) { \ +======= + if (__err == CUSOLVER_STATUS_INVALID_VALUE) { \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK_LINALG( \ false, \ "cusolver error: ", \ diff --git a/aten/src/ATen/cuda/PinnedMemoryAllocator.h b/aten/src/ATen/cuda/PinnedMemoryAllocator.h index 854f5d8dd129..bbe3687ba24c 100644 --- a/aten/src/ATen/cuda/PinnedMemoryAllocator.h +++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.h @@ -1,11 +1,19 @@ #pragma once +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at::cuda { +<<<<<<< HEAD inline TORCH_CUDA_CPP_API at::Allocator* getPinnedMemoryAllocator() { return getCachingHostAllocator(); +======= +inline TORCH_CUDA_CPP_API at::HostAllocator* getPinnedMemoryAllocator() { + return at::getHostAllocator(at::kCUDA); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace at::cuda diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-f16-8.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-f16-8.cu new file mode 100644 index 000000000000..6c20daed2e02 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-f16-8.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int32-1.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-1.cu new file mode 100644 index 000000000000..2adb6a519882 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-1.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int32_t, 1) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int32-2.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-2.cu new file mode 100644 index 000000000000..39e29b7668c9 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-2.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int32_t, 2) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int32-4.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-4.cu new file mode 100644 index 000000000000..3ad1ebd2a56a --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int32-4.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int32_t, 4) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int64-1.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-1.cu new file mode 100644 index 000000000000..098615b68345 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-1.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int64_t, 1) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int64-2.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-2.cu new file mode 100644 index 000000000000..d58e0c8d5ce7 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-2.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + 
+AT_INSTANTIATE_SORT_PAIRS(int64_t, 2) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-int64-4.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-4.cu new file mode 100644 index 000000000000..fe24f72151fb --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-int64-4.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-scalars.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-scalars.cu new file mode 100644 index 000000000000..1373668316c2 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-scalars.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-uint16-8.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-uint16-8.cu new file mode 100644 index 000000000000..f52f97fe588a --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-uint16-8.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(uint16_t, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-uint32-8.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-uint32-8.cu new file mode 100644 index 000000000000..db28bb602acc --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-uint32-8.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(uint32_t, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs-uint64-8.cu b/aten/src/ATen/cuda/cub-RadixSortPairs-uint64-8.cu new file mode 100644 index 000000000000..7ad51b90b834 --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs-uint64-8.cu @@ -0,0 +1,7 @@ +#include + +namespace at::cuda::cub::detail { + +AT_INSTANTIATE_SORT_PAIRS(uint64_t, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs.cuh b/aten/src/ATen/cuda/cub-RadixSortPairs.cuh new file mode 100644 index 000000000000..bd40deb4125b --- /dev/null +++ b/aten/src/ATen/cuda/cub-RadixSortPairs.cuh @@ -0,0 +1,74 @@ +#pragma once + +#define TORCH_ASSERT_NO_OPERATORS +#include +#include + +namespace at::cuda::cub::detail { + +template +void radix_sort_pairs_impl( + const key_t* keys_in, + key_t* keys_out, + const OpaqueType* values_in, + OpaqueType* values_out, + int64_t n, + bool descending, + int64_t begin_bit, + int64_t end_bit) { + TORCH_CHECK( + n <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); + using key_t_ = typename detail::cuda_type::type; + + auto allocator = c10::cuda::CUDACachingAllocator::get(); + c10::DataPtr keys_out_owner; + + if (keys_out == nullptr) { + keys_out_owner = allocator->allocate(n * sizeof(key_t)); + keys_out = reinterpret_cast(keys_out_owner.get()); + } + + const key_t_* keys_in_ = reinterpret_cast(keys_in); + key_t_* keys_out_ = reinterpret_cast(keys_out); + + if (descending) { + CUB_WRAPPER( + NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortPairsDescending, + keys_in_, + keys_out_, + values_in, + values_out, + n, + begin_bit, + end_bit, + c10::cuda::getCurrentCUDAStream()); + } else { + CUB_WRAPPER( + NO_ROCM(at_cuda_detail)::cub::DeviceRadixSort::SortPairs, + keys_in_, + keys_out_, + values_in, + values_out, + n, + begin_bit, + end_bit, + c10::cuda::getCurrentCUDAStream()); + } +} + +#define 
AT_INSTANTIATE_SORT_PAIRS(key_t, value_size) \ + template void radix_sort_pairs_impl( \ + const key_t* keys_in, \ + key_t* keys_out, \ + const OpaqueType* values_in, \ + OpaqueType* values_out, \ + int64_t n, \ + bool descending, \ + int64_t begin_bit, \ + int64_t end_bit); + +#define AT_INSTANTIATE_SORT_PAIRS_8(scalar_t, ScalarType) \ + AT_INSTANTIATE_SORT_PAIRS(scalar_t, 8) + +} // namespace at::cuda::cub::detail diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index a1a7ab70630b..96df8d77554b 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -37,11 +37,18 @@ // handle the temporary storage and 'twice' calls for cub API #define CUB_WRAPPER(func, ...) do { \ size_t temp_storage_bytes = 0; \ +<<<<<<< HEAD func(nullptr, temp_storage_bytes, __VA_ARGS__); \ auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get(); \ auto temp_storage = caching_allocator.allocate(temp_storage_bytes); \ func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__); \ AT_CUDA_CHECK(cudaGetLastError()); \ +======= + AT_CUDA_CHECK(func(nullptr, temp_storage_bytes, __VA_ARGS__)); \ + auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get(); \ + auto temp_storage = caching_allocator.allocate(temp_storage_bytes); \ + AT_CUDA_CHECK(func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__));\ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } while (false) #ifdef USE_ROCM @@ -292,7 +299,11 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT #endif } +<<<<<<< HEAD # if (defined(CUDA_VERSION) && CUDA_VERSION > 11040) || defined(USE_ROCM) +======= +# if defined(CUDA_VERSION) || defined(USE_ROCM) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template struct BlockPrefixCallbackOp diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 21484c0dea9a..4645259cde4c 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -19,6 +19,13 @@ #include #include +<<<<<<< HEAD +======= +#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +#include +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_CUDNN_ENABLED() #include #endif @@ -91,7 +98,31 @@ void CUDAHooks::init() const { // Sets the CUDA_MODULE_LOADING environment variable // if it's not set by the user. +<<<<<<< HEAD c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false); +======= + // CUDA_MODULE_LOADING="LAZY" is default for all drivers released for CUDA 12.2+. + // Check the driver version and only set the env variable if needed. 
+ bool set_lazy_module_loading = true; + #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) + auto driver_api = c10::cuda::DriverAPI::get(); + // Initialize NVML + if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) { + // Get the driver version + int version = -1; + auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version); + if (res == NVML_SUCCESS) { + // Check if driver is sufficiently new + if (version >= 12020) { + set_lazy_module_loading = false; + } + } + } + #endif + if (set_lazy_module_loading) { + c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto num_devices = c10::cuda::device_count_ensure_non_zero(); c10::cuda::CUDACachingAllocator::init(num_devices); at::cuda::detail::init_p2p_access_cache(num_devices); @@ -458,8 +489,19 @@ DeviceIndex CUDAHooks::getCurrentDevice() const { } #ifdef USE_ROCM +<<<<<<< HEAD bool CUDAHooks::isGPUArch(DeviceIndex device_index, const std::vector& archs) const { hipDeviceProp_t* prop = at::cuda::getDeviceProperties(device_index); +======= +bool CUDAHooks::isGPUArch(const std::vector& archs, DeviceIndex device_index) const { + hipDeviceProp_t* prop; + if (device_index == -1){ + prop = at::cuda::getCurrentDeviceProperties(); + } else { + prop = at::cuda::getDeviceProperties(device_index); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::string device_arch = prop->gcnArchName; for (std::string arch : archs) { size_t substring = device_arch.find(arch); diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 34f2adee5140..84837dd871d9 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -58,7 +58,11 @@ struct CUDAHooks : public at::CUDAHooksInterface { DeviceIndex getCurrentDevice() const override; #ifdef USE_ROCM +<<<<<<< HEAD bool isGPUArch(DeviceIndex device_index, const std::vector& archs) const override; +======= + bool isGPUArch(const std::vector& archs, DeviceIndex device_index = -1) const override; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif void deviceSynchronize(DeviceIndex device_index) const override; }; diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp index c9cabeb9399f..92920db451dd 100644 --- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -146,17 +146,30 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *) NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *) NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *) +<<<<<<< HEAD #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *) NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *) #endif +======= +NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *) +NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *) _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult) NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*) NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *) 
NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **) +<<<<<<< HEAD CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *) +======= +CUDA_STUB2(cuModuleLoad, CUmodule*, const char*) +CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *) +CUDA_STUB2(cuFuncSetCacheConfig, CUfunction, CUfunc_cache_enum) +CUDA_STUB3(cuDeviceGetAttribute, int*, CUdevice_attribute_enum, CUdevice) +CUDA_STUB2(cuDeviceGet, CUdevice*, int) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *) CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t) CUDA_STUB2(cuGetErrorString, CUresult, const char **) @@ -169,6 +182,11 @@ CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *) CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *) CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int) CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction) +<<<<<<< HEAD +======= +CUDA_STUB3(cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000 CUresult CUDAAPI diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index f2e2a0cef55a..689edd8a16eb 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -43,6 +43,10 @@ namespace at::cuda { _(nvrtcGetProgramLogSize) \ _(nvrtcGetProgramLog) \ _(nvrtcGetLoweredName) \ +<<<<<<< HEAD +======= + _(cuModuleLoad) \ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _(cuModuleLoadData) \ _(cuModuleLoadDataEx) \ _(cuModuleGetFunction) \ @@ -60,6 +64,14 @@ namespace at::cuda { _(cuLinkComplete) \ _(cuFuncSetAttribute) \ _(cuFuncGetAttribute) \ +<<<<<<< HEAD +======= + _(cuPointerGetAttribute) \ + _(cuFuncSetCacheConfig) \ + _(cuDeviceGetAttribute) \ + _(cuDeviceGet) \ + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000 #define AT_FORALL_NVRTC_EXTENDED(_) \ @@ -70,7 +82,11 @@ namespace at::cuda { AT_FORALL_NVRTC_BASE(_) #endif +<<<<<<< HEAD #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 +======= +#if defined(CUDA_VERSION) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define AT_FORALL_NVRTC(_) \ AT_FORALL_NVRTC_EXTENDED(_) \ _(nvrtcGetCUBINSize) \ diff --git a/aten/src/ATen/cuda/tunable/GemmCommon.h b/aten/src/ATen/cuda/tunable/GemmCommon.h index 5ed30f74b989..21d85a031e65 100644 --- a/aten/src/ATen/cuda/tunable/GemmCommon.h +++ b/aten/src/ATen/cuda/tunable/GemmCommon.h @@ -469,7 +469,11 @@ struct GemmAndBiasParams : OpParams { bool duplicate_inputs_{false}; }; +<<<<<<< HEAD template +======= +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct GemmStridedBatchedParams : OpParams { std::string BLASSignature() const override { std::string alpha_str = to_string_opmath(alpha); @@ -477,7 +481,11 @@ struct GemmStridedBatchedParams : OpParams { return fmt::sprintf("- { function: matmul, M: %ld, N: 
%ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, " "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }", m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch, +<<<<<<< HEAD BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor()); +======= + BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(C_Dtype{}), BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } std::string Signature() const override { @@ -517,7 +525,11 @@ struct GemmStridedBatchedParams : OpParams { c10::DeviceIndex device = 0; AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); size_t c_size = GetSizeC(); +<<<<<<< HEAD copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); +======= + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); if (duplicate_inputs) { @@ -544,7 +556,11 @@ struct GemmStridedBatchedParams : OpParams { } TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { +<<<<<<< HEAD auto c_dtype = c10::CppTypeToScalarType::value; +======= + auto c_dtype = c10::CppTypeToScalarType::value; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? 
OK : FAIL; } @@ -561,7 +577,11 @@ struct GemmStridedBatchedParams : OpParams { int64_t ldb{}; int64_t stride_b{}; at::opmath_type beta; +<<<<<<< HEAD T* c{}; +======= + C_Dtype* c{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t ldc{}; int64_t stride_c{}; int64_t batch{}; diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index b899efde0e9c..863907c47595 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -85,6 +85,18 @@ constexpr hipDataType HipDataTypeFor() { return static_cast(500); } +<<<<<<< HEAD +======= +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 70000 + return HIP_R_4F_E2M1; +#else + return static_cast(33); +#endif +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template int GetBatchFromParams(const GemmParams* params) { return 1; @@ -591,6 +603,17 @@ auto GetHipBlasLtTypeStringAndOps() { auto b_datatype = HipDataTypeFor(); auto in_out_datatype = HipDataTypeFor(); std::vector heuristic_result; +<<<<<<< HEAD +======= +#if ROCM_VERSION == 60400 + // hipblaslt TT fp32 regression on ROCm 6.4, cannot use + if ((a_datatype == HIP_R_32F || b_datatype == HIP_R_32F || in_out_datatype == HIP_R_32F) + && (transa_outer == HIPBLAS_OP_T && transb_outer == HIPBLAS_OP_T)) { + std::vector>>> ignore; + return ignore; + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; if (at::globalContext().allowTF32CuBLAS()) { diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 71ac97e66688..616e80c6e651 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -524,8 +524,13 @@ void TuningContext::EnableNumericsCheck(bool value) { } bool TuningContext::IsNumericsCheckEnabled() const { +<<<<<<< HEAD const char *env = getenv("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); if (env != nullptr && strcmp(env, "1") == 0) { +======= + const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); + if (env == "1") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return true; } return numerics_check_enable_; diff --git a/aten/src/ATen/cuda/tunable/Tunable.h b/aten/src/ATen/cuda/tunable/Tunable.h index b8187b4254bf..5f8e65ea6224 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.h +++ b/aten/src/ATen/cuda/tunable/Tunable.h @@ -40,9 +40,15 @@ enum TORCH_CUDA_CPP_API TuningStatus { class TORCH_CUDA_CPP_API ResultEntry { public: explicit ResultEntry(std::string key, double time) : key_(std::move(key)), time_(time) {} +<<<<<<< HEAD explicit ResultEntry(std::string key, double time, const std::string& blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(blas_sig) {} bool operator==(const ResultEntry& other) { return key_ == other.key_; } bool operator!=(const ResultEntry& other) { return key_ != other.key_; } +======= + explicit ResultEntry(std::string key, double time, std::string blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(std::move(blas_sig)) {} + bool operator==(const ResultEntry& other) const { return key_ == other.key_; } + bool operator!=(const ResultEntry& other) const { return key_ != 
other.key_; } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) operator std::string () { return key_; } std::string GetKey() const { return key_; } double GetTime() const { return time_; } diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index d7c32ac2cf33..b40f9d022f64 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -156,8 +156,12 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo default: TORCH_INTERNAL_ASSERT(false, "unsupported memory_format for cuDNN filters"); } +<<<<<<< HEAD // NOLINTNEXTLINE(*narrowing-conversions) set(getDataType(t), static_cast(dim), size, filter_format); +======= + set(getDataType(t), static_cast(dim), size, filter_format); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) { diff --git a/aten/src/ATen/cudnn/README.md b/aten/src/ATen/cudnn/README.md index 057fbc92ecb0..8f4de9e61745 100644 --- a/aten/src/ATen/cudnn/README.md +++ b/aten/src/ATen/cudnn/README.md @@ -1,4 +1,8 @@ All files living in this directory are written with the assumption that cuDNN is available, which means that these code are not guarded by `#if AT_CUDNN_ENABLED()`. Therefore, whenever you need to use definitions from here, please guard the `#include` and +<<<<<<< HEAD definition usages with `#if AT_CUDNN_ENABLED()` macro, e.g. [native/cudnn/BatchNorm.cpp](native/cudnn/BatchNorm.cpp). +======= +definition usages with `#if AT_CUDNN_ENABLED()` macro, e.g. [native/cudnn/BatchNorm.cpp](../native/cudnn/BatchNorm.cpp). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 384d8dfe980f..118a02640967 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -86,7 +86,11 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP); } +<<<<<<< HEAD bool isPinnedPtr(const void* data) const override { +======= + bool isPinnedPtr(const void* /*data*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } @@ -200,7 +204,11 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { } #ifdef USE_ROCM +<<<<<<< HEAD virtual bool isGPUArch(DeviceIndex /*device_index*/, const std::vector& /*archs*/) const { +======= + virtual bool isGPUArch(const std::vector& /*archs*/, DeviceIndex = -1 /*device_index*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot check GPU arch without ATen_cuda library. 
", CUDA_HELP); } #endif diff --git a/aten/src/ATen/detail/HIPHooksInterface.h b/aten/src/ATen/detail/HIPHooksInterface.h index e19a379efbda..f63b6736a8b8 100644 --- a/aten/src/ATen/detail/HIPHooksInterface.h +++ b/aten/src/ATen/detail/HIPHooksInterface.h @@ -6,8 +6,11 @@ #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NB: Class must live in `at` due to limitations of Registry.h. namespace at { @@ -37,7 +40,11 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface { return -1; } +<<<<<<< HEAD bool isPinnedPtr(const void* data) const override { +======= + bool isPinnedPtr(const void* /*data*/ ) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } @@ -49,7 +56,11 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface { return 0; } +<<<<<<< HEAD bool hasPrimaryContext(DeviceIndex device_index) const override { +======= + bool hasPrimaryContext(DeviceIndex /*device_index*/ ) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot check primary context without ATen_hip library."); } }; diff --git a/aten/src/ATen/detail/IPUHooksInterface.h b/aten/src/ATen/detail/IPUHooksInterface.h index ee29aa352f3d..f86fb42db259 100644 --- a/aten/src/ATen/detail/IPUHooksInterface.h +++ b/aten/src/ATen/detail/IPUHooksInterface.h @@ -15,7 +15,11 @@ struct TORCH_API IPUHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library."); } +<<<<<<< HEAD bool hasPrimaryContext(DeviceIndex device_index) const override { +======= + bool hasPrimaryContext(DeviceIndex /*device_index*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library."); return false; } @@ -26,7 +30,11 @@ struct TORCH_API IPUHooksInterface : AcceleratorHooksInterface { } Generator getNewGenerator( +<<<<<<< HEAD DeviceIndex device_index [[maybe_unused]] = -1) const override { +======= + DeviceIndex /*device_index*/ = -1) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library."); } }; diff --git a/aten/src/ATen/detail/MAIAHooksInterface.h b/aten/src/ATen/detail/MAIAHooksInterface.h index 554cc93043fd..cf4d09832002 100644 --- a/aten/src/ATen/detail/MAIAHooksInterface.h +++ b/aten/src/ATen/detail/MAIAHooksInterface.h @@ -17,7 +17,11 @@ struct TORCH_API MAIAHooksInterface : AcceleratorHooksInterface { TORCH_CHECK(false, "Cannot initialize MAIA without ATen_maia library."); } +<<<<<<< HEAD bool hasPrimaryContext(DeviceIndex device_index) const override { +======= + bool hasPrimaryContext(DeviceIndex /*device_index*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(false, "Cannot initialize MAIA without ATen_maia library."); return false; } diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h index 01d6281e8afe..3fd0f71a2c3e 100644 --- a/aten/src/ATen/detail/MPSHooksInterface.h +++ 
b/aten/src/ATen/detail/MPSHooksInterface.h @@ -78,6 +78,12 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface { virtual uint32_t acquireEvent(bool enable_timing) const { FAIL_MPSHOOKS_FUNC(__func__); } +<<<<<<< HEAD +======= + Device getDeviceFromPtr(void* data) const override { + TORCH_CHECK(false, "Cannot get device of pointer on MPS without ATen_mps library. "); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) virtual void releaseEvent(uint32_t event_id) const { FAIL_MPSHOOKS_FUNC(__func__); } diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h index b69e0027ea13..4bd62b9ffa4a 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.h +++ b/aten/src/ATen/detail/MTIAHooksInterface.h @@ -12,7 +12,10 @@ #include #include +<<<<<<< HEAD C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter") +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at { class Context; } @@ -46,7 +49,11 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { return 0; } +<<<<<<< HEAD virtual void deviceSynchronize(c10::DeviceIndex device_index) const { +======= + virtual void deviceSynchronize(c10::DeviceIndex /*device_index*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); } @@ -54,11 +61,19 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { FAIL_MTIAHOOKS_FUNC(__func__); } +<<<<<<< HEAD bool hasPrimaryContext(DeviceIndex device_index) const override { return false; } void setCurrentDevice(DeviceIndex device) const override { +======= + bool hasPrimaryContext(DeviceIndex /*device_index*/) const override { + return false; + } + + void setCurrentDevice(DeviceIndex /*device*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); } @@ -67,31 +82,60 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { return -1; } +<<<<<<< HEAD DeviceIndex exchangeDevice(DeviceIndex device) const override { +======= + DeviceIndex exchangeDevice(DeviceIndex /*device*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return -1; } +<<<<<<< HEAD DeviceIndex maybeExchangeDevice(DeviceIndex device) const override { +======= + DeviceIndex maybeExchangeDevice(DeviceIndex /*device*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return -1; } +<<<<<<< HEAD virtual c10::Stream getCurrentStream(DeviceIndex device) const { +======= + virtual c10::Stream getCurrentStream(DeviceIndex /*device*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return c10::Stream::unpack3(-1, 0, c10::DeviceType::MTIA); } +<<<<<<< HEAD virtual c10::Stream getDefaultStream(DeviceIndex device) const { +======= + virtual int64_t getCurrentRawStream(DeviceIndex /*device*/) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return -1; + } + + virtual c10::Stream getDefaultStream(DeviceIndex 
/*device*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return c10::Stream::unpack3(-1, 0, c10::DeviceType::MTIA); } +<<<<<<< HEAD virtual void setCurrentStream(const c10::Stream& stream) const { FAIL_MTIAHOOKS_FUNC(__func__); } bool isPinnedPtr(const void* data) const override { +======= + virtual void setCurrentStream(const c10::Stream& /*stream*/ ) const { + FAIL_MTIAHOOKS_FUNC(__func__); + } + + bool isPinnedPtr(const void* /*data*/) const override { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return false; } @@ -100,12 +144,25 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { return nullptr; } +<<<<<<< HEAD virtual PyObject* memoryStats(DeviceIndex device) const { +======= + virtual PyObject* memoryStats(DeviceIndex /*device*/) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return nullptr; } +<<<<<<< HEAD virtual PyObject* getDeviceCapability(DeviceIndex device) const { +======= + virtual PyObject* getDeviceCapability(DeviceIndex /*device*/) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return nullptr; + } + + virtual PyObject* getDeviceProperties(DeviceIndex device) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return nullptr; } @@ -116,6 +173,7 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { virtual void recordMemoryHistory( +<<<<<<< HEAD const std::optional& enabled, const std::string& stacks, size_t max_entries) const { @@ -123,6 +181,15 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { } virtual PyObject* memorySnapshot() const { +======= + const std::optional& /*enabled*/, + const std::string& /*stacks*/, + size_t /*max_entries*/) const { + FAIL_MTIAHOOKS_FUNC(__func__); + } + + virtual PyObject* memorySnapshot(const std::optional& local_path) const { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) FAIL_MTIAHOOKS_FUNC(__func__); return nullptr; } @@ -132,10 +199,21 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { return 0; } +<<<<<<< HEAD virtual void resetPeakMemoryStats(DeviceIndex device) const { FAIL_MTIAHOOKS_FUNC(__func__); } +======= + virtual void resetPeakMemoryStats(DeviceIndex /*device*/) const { + FAIL_MTIAHOOKS_FUNC(__func__); + } + + virtual void attachOutOfMemoryObserver(PyObject* observer) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct TORCH_API MTIAHooksArgs {}; @@ -149,4 +227,7 @@ TORCH_API const MTIAHooksInterface& getMTIAHooks(); TORCH_API bool isMTIAHooksBuilt(); } // namespace detail } // namespace at +<<<<<<< HEAD C10_DIAGNOSTIC_POP() +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.h b/aten/src/ATen/detail/PrivateUse1HooksInterface.h index 69819c764260..0dbc878cf821 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.h +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.h 
@@ -23,6 +23,17 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface { ~PrivateUse1HooksInterface() override = default; +<<<<<<< HEAD +======= + bool isBuilt() const override { + FAIL_PRIVATEUSE1HOOKS_FUNC(__func__); + } + + bool isAvailable() const override { + FAIL_PRIVATEUSE1HOOKS_FUNC(__func__); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const at::Generator& getDefaultGenerator( c10::DeviceIndex device_index) const override { FAIL_PRIVATEUSE1HOOKS_FUNC(__func__); diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index cca20e9e553e..9bcaccda5840 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -193,6 +193,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { OP_DECOMPOSE(_lu_with_info); OP_DECOMPOSE(matmul); OP_DECOMPOSE(matrix_H); +<<<<<<< HEAD +======= + OP_DECOMPOSE(matrix_exp); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) OP_DECOMPOSE(matrix_power); OP_DECOMPOSE2(max, other ); OP_DECOMPOSE(max_pool1d); diff --git a/aten/src/ATen/functorch/BatchRulesPooling.cpp b/aten/src/ATen/functorch/BatchRulesPooling.cpp index c6cab4a42d6f..fe998e027513 100644 --- a/aten/src/ATen/functorch/BatchRulesPooling.cpp +++ b/aten/src/ATen/functorch/BatchRulesPooling.cpp @@ -12,7 +12,11 @@ namespace at::functorch { template +<<<<<<< HEAD std::tuple,Tensor, std::optional> +======= +static std::tuple,Tensor, std::optional> +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) max_pool_with_indices_batch_rule_helper( const Tensor& self, std::optional self_bdim, IntArrayRef kernel_size, IntArrayRef stride, diff --git a/aten/src/ATen/functorch/BatchRulesRandomness.cpp b/aten/src/ATen/functorch/BatchRulesRandomness.cpp index b578047dd6fd..a39c8c31d06d 100644 --- a/aten/src/ATen/functorch/BatchRulesRandomness.cpp +++ b/aten/src/ATen/functorch/BatchRulesRandomness.cpp @@ -20,7 +20,11 @@ namespace at::functorch { template +<<<<<<< HEAD Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) { +======= +static Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_layer.has_value()); @@ -37,7 +41,11 @@ Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) { } template +<<<<<<< HEAD Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) { +======= +static Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_layer.has_value()); @@ -108,7 +116,11 @@ static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor } template +<<<<<<< HEAD Tensor randperm_batching_rule(int64_t n, ExtraArgs... 
extra_args) { +======= +static Tensor randperm_batching_rule(int64_t n, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); auto const batch_size = maybe_layer->batchSize(); @@ -127,7 +139,11 @@ Tensor randperm_batching_rule(int64_t n, ExtraArgs... extra_args) { } template +<<<<<<< HEAD Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extra_args) { +======= +static Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); @@ -153,7 +169,11 @@ Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extr } template +<<<<<<< HEAD Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... extra_args) { +======= +static Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); @@ -272,7 +292,11 @@ struct RandomBatchRuleHelper> { }; template +<<<<<<< HEAD Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) { +======= +static Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Func(high, shape, std::forward(extra_args)...); } @@ -299,7 +323,11 @@ struct RandIntBatchRuleHelper> { }; template +<<<<<<< HEAD Tensor rand_int_low_wrapper(SymIntArrayRef shape, T0 scalar0, T1 scalar1, T... extra_args) { +======= +static Tensor rand_int_low_wrapper(SymIntArrayRef shape, T0 scalar0, T1 scalar1, T... extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Func(scalar0, scalar1, shape, std::forward(extra_args)...); } @@ -346,7 +374,11 @@ struct NormalPointwiseBatchRule> { }; template +<<<<<<< HEAD Tensor normal_wrapper(const Tensor& tensor, double scalar, T... extra_args) { +======= +static Tensor normal_wrapper(const Tensor& tensor, double scalar, T... 
extra_args) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return Func(scalar, tensor, extra_args...); } diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index a7366eef4fd3..6f22c16bfa78 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -385,9 +385,17 @@ namespace { // next broadcast all index tensors together try { indices = at::expand_outplace(indices); +<<<<<<< HEAD } catch (std::exception &e) { TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together" " with shapes "); +======= + } catch (std::exception&) { + TORCH_CHECK_INDEX( + false, + "shape mismatch: indexing tensors could not be broadcast together" + " with shapes "); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // add missing null Tensors so that it matches self.dim() while (indices.size() < static_cast(self.dim())) { @@ -771,6 +779,18 @@ std::tuple> scatter_add_batch_rule( self, self_bdim, dim, index, index_bdim, src, src_bdim); } +<<<<<<< HEAD +======= +std::tuple> scatter_add__batch_rule( + const Tensor& self, std::optional self_bdim, + int64_t dim, + const Tensor& index, std::optional index_bdim, + const Tensor& src, std::optional src_bdim) { + return scatter_batch_rule(ATEN_FN(scatter_add_), + self, self_bdim, dim, index, index_bdim, src, src_bdim); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::tuple> scatter_reduce_batch_rule( const Tensor& self, std::optional self_bdim, int64_t dim, @@ -1276,6 +1296,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT2(scatter, value, scatter_value_batch_rule); VMAP_SUPPORT2(scatter, src, scatter_src_batch_rule); VMAP_SUPPORT(scatter_add, scatter_add_batch_rule); +<<<<<<< HEAD +======= + VMAP_SUPPORT(scatter_add_, scatter_add__batch_rule); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) VMAP_SUPPORT2(scatter, reduce, scatter_reduce_batch_rule); VMAP_SUPPORT2(scatter, value_reduce, scatter_value_reduce_batch_rule); VMAP_SUPPORT2(scatter_reduce, two, scatter_reduce_two_batch_rule); diff --git a/aten/src/ATen/functorch/BatchedFallback.cpp b/aten/src/ATen/functorch/BatchedFallback.cpp index 55d9e91834d2..d2278e3c5502 100644 --- a/aten/src/ATen/functorch/BatchedFallback.cpp +++ b/aten/src/ATen/functorch/BatchedFallback.cpp @@ -19,7 +19,11 @@ namespace at::functorch { +<<<<<<< HEAD bool kVmapFallbackWarningEnabled = true; +======= +static bool kVmapFallbackWarningEnabled = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool isVmapFallbackWarningEnabled() { return kVmapFallbackWarningEnabled; @@ -29,7 +33,11 @@ void setVmapFallbackWarningEnabled(bool enabled) { kVmapFallbackWarningEnabled = enabled; } +<<<<<<< HEAD bool kVmapFallbackEnabled = true; +======= +static bool kVmapFallbackEnabled = true; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool isVmapFallbackEnabled() { return kVmapFallbackEnabled; diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 
e42f8dd87b50..62e3ade38937 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -159,6 +159,10 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::XLA, DispatchKey::CUDA, DispatchKey::CPU, +<<<<<<< HEAD +======= + DispatchKey::PrivateUse1, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }); inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { diff --git a/aten/src/ATen/functorch/Interpreter.h b/aten/src/ATen/functorch/Interpreter.h index bdea11d3b2a0..84edfb40e1db 100644 --- a/aten/src/ATen/functorch/Interpreter.h +++ b/aten/src/ATen/functorch/Interpreter.h @@ -8,6 +8,11 @@ #include #include +<<<<<<< HEAD +======= +#include + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::functorch { // NOTE: [functorch interpreter stack] @@ -91,24 +96,112 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t); struct VmapInterpreterMeta { explicit VmapInterpreterMeta(c10::SymInt batchSize, RandomnessType randomness) : batchSize_(std::move(batchSize)), randomness_(randomness) {} +<<<<<<< HEAD + c10::SymInt batchSize_; + RandomnessType randomness_; +======= + c10::SymInt batchSize_; RandomnessType randomness_; + + VmapInterpreterMeta() = default; + VmapInterpreterMeta(const VmapInterpreterMeta&) = default; + VmapInterpreterMeta(VmapInterpreterMeta&&) = default; + VmapInterpreterMeta& operator=(const VmapInterpreterMeta&) = default; + VmapInterpreterMeta& operator=(VmapInterpreterMeta&&) = default; + ~VmapInterpreterMeta() = default; + + template + friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) { + if (json_t.batchSize_.is_heap_allocated()) { + throw std::runtime_error("Serialization for heap-allocated SymInt is not implemented yet"); + } + json_j["batchSize"] = json_t.batchSize_.as_int_unchecked(); + json_j["randomness"] = static_cast(json_t.randomness_); + } + + template + friend void from_json(const T& json_j, VmapInterpreterMeta& json_t) { + json_t.batchSize_ = c10::SymInt(SymInt::Unchecked::UNCHECKED, json_j["batchSize"]); + json_t.randomness_ = static_cast(json_j["randomness"]); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct GradInterpreterMeta { explicit GradInterpreterMeta(bool prevGradMode): prevGradMode_(prevGradMode) {} +<<<<<<< HEAD + bool prevGradMode_; +======= + GradInterpreterMeta() = default; + GradInterpreterMeta(const GradInterpreterMeta&) = default; + GradInterpreterMeta(GradInterpreterMeta&&) = default; + GradInterpreterMeta& operator=(const GradInterpreterMeta&) = default; + GradInterpreterMeta& operator=(GradInterpreterMeta&&) = default; + ~GradInterpreterMeta() = default; + bool prevGradMode_; + template + friend void to_json(T& json_j, const GradInterpreterMeta& json_t) { + json_j["prevGradMode"] = json_t.prevGradMode_; + } + + template + friend void from_json(const T& json_j, GradInterpreterMeta& json_t) { + json_t.prevGradMode_ = json_j["prevGradMode"]; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct JvpInterpreterMeta { explicit JvpInterpreterMeta(bool prevFwdGradMode) : prevFwdGradMode_(prevFwdGradMode) {} +<<<<<<< HEAD + bool prevFwdGradMode_; 
+======= + JvpInterpreterMeta() = default; + JvpInterpreterMeta(const JvpInterpreterMeta&) = default; + JvpInterpreterMeta(JvpInterpreterMeta&&) = default; + JvpInterpreterMeta& operator=(const JvpInterpreterMeta&) = default; + JvpInterpreterMeta& operator=(JvpInterpreterMeta&&) = default; + ~JvpInterpreterMeta() = default; + bool prevFwdGradMode_; + template + friend void to_json(T& json_j, const JvpInterpreterMeta& json_t) { + json_j["prevFwdGradMode"] = json_t.prevFwdGradMode_; + } + + template + friend void from_json(const T& json_j, JvpInterpreterMeta& json_t) { + json_t.prevFwdGradMode_ = json_j["prevFwdGradMode"]; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; struct FunctionalizeInterpreterMeta { explicit FunctionalizeInterpreterMeta(bool functionalizeAddBackViews) : functionalizeAddBackViews_(functionalizeAddBackViews) {} +<<<<<<< HEAD + bool functionalizeAddBackViews_; +======= + FunctionalizeInterpreterMeta() = default; + FunctionalizeInterpreterMeta(const FunctionalizeInterpreterMeta&) = default; + FunctionalizeInterpreterMeta(FunctionalizeInterpreterMeta&&) = default; + FunctionalizeInterpreterMeta& operator=(const FunctionalizeInterpreterMeta&) = default; + FunctionalizeInterpreterMeta& operator=(FunctionalizeInterpreterMeta&&) = default; + ~FunctionalizeInterpreterMeta() = default; + bool functionalizeAddBackViews_; + template + friend void to_json(T& json_j, const FunctionalizeInterpreterMeta& json_t) { + json_j["functionalizeAddBackViews"] = json_t.functionalizeAddBackViews_; + } + + template + friend void from_json(const T& json_j, FunctionalizeInterpreterMeta& json_t) { + json_t.functionalizeAddBackViews_ = json_j["functionalizeAddBackViews"]; + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; typedef std::variant< @@ -172,6 +265,78 @@ struct Interpreter { // Please don't use this explicit Interpreter() = default; +<<<<<<< HEAD +======= + template + friend void to_json(T& json_j, const Interpreter& json_t) { + json_j["type"] = static_cast(json_t.type_); + json_j["level"] = json_t.level_; + if (json_t.savedLocalDispatchKeySet_) { + json_j["savedLocalDispatchKeySet"] = { + {"included", json_t.savedLocalDispatchKeySet_->included_.raw_repr()}, + {"excluded", json_t.savedLocalDispatchKeySet_->excluded_.raw_repr()} + }; + } else { + json_j["savedLocalDispatchKeySet"] = nlohmann::json(); + } + json_j["is_alive"] = *json_t.is_alive_; + std::visit([&](auto&& arg) { + using V = std::decay_t; + if constexpr (std::is_same_v) { + json_j["meta"] = {{"Torch", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Grad", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Jvp", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Vmap", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Functionalize", arg}}; + } else { + static_assert(false && sizeof(V), "unknown variant case"); + } + }, json_t.meta_); + } + + template + friend void from_json(const T& json_j, Interpreter& json_t) { + json_t.type_ = static_cast(json_j["type"]); + json_t.level_ = json_j["level"]; + auto savedLocalDispatchKeySet = json_j["savedLocalDispatchKeySet"]; + if (savedLocalDispatchKeySet.is_null()) { + json_t.savedLocalDispatchKeySet_ = std::nullopt; + } else { + c10::impl::PODLocalDispatchKeySet pod; + 
pod.set_included(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["included"].template get())); + pod.set_excluded(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["excluded"].template get())); + json_t.savedLocalDispatchKeySet_ = c10::impl::LocalDispatchKeySet(pod); + } + json_t.is_alive_ = std::make_shared(json_j["is_alive"]); + auto meta = json_j["meta"]; + if (meta.contains("Torch")) { + json_t.meta_.emplace(meta["Torch"].template get()); + } else if (meta.contains("Grad")) { + json_t.meta_.emplace(meta["Grad"].template get()); + } else if (meta.contains("Jvp")) { + json_t.meta_.emplace(meta["Jvp"].template get()); + } else if (meta.contains("Vmap")) { + json_t.meta_.emplace(meta["Vmap"].template get()); + } else if (meta.contains("Functionalize")) { + json_t.meta_.emplace(meta["Functionalize"].template get()); + } else { + throw std::runtime_error("unknown interpreter metadata type"); + } + } + + std::string serialize() const { + return nlohmann::json(*this).dump(); + } + + static Interpreter deserialize(const std::string& serialized) { + return nlohmann::json::parse(serialized).get(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) private: explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta): type_(type), level_(level), is_alive_(std::make_shared(false)), meta_(std::move(meta)) {} diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 7bc3a3cbfe44..f9aad2862469 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -143,7 +143,11 @@ static Tensor make_feature_noise(const Tensor& input) { } static bool is_fused_kernel_acceptable(const Tensor& input, double p) { +<<<<<<< HEAD return (input.is_cuda() || input.is_xpu() || input.is_lazy()) && p > 0 && p < 1 && input.numel() > 0; +======= + return (input.is_cuda() || input.is_xpu() || input.is_lazy() || input.is_privateuseone()) && p > 0 && p < 1 && input.numel() > 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // NB: sure, we could have used different overloads here, but I would feel insecure diff --git a/aten/src/ATen/functorch/TensorWrapper.cpp b/aten/src/ATen/functorch/TensorWrapper.cpp index 4f50a1fe2b40..5c39ddb4ddd8 100644 --- a/aten/src/ATen/functorch/TensorWrapper.cpp +++ b/aten/src/ATen/functorch/TensorWrapper.cpp @@ -56,7 +56,12 @@ void dumpTensorCout(const Tensor& tensor) { static c10::intrusive_ptr makeTensorWrapperPtr(const Tensor& tensor, int64_t level, const std::shared_ptr& life_handle) { auto keys_to_propagate = kKeysToPropagateToWrapper | DispatchKeySet({ +<<<<<<< HEAD DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA, DispatchKey::AutogradXLA}); +======= + DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA, DispatchKey::AutogradXLA, + DispatchKey::AutogradPrivateUse1}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto key_set = getKeysToPropagateToWrapper(tensor, keys_to_propagate); key_set = key_set.add(DispatchKey::FuncTorchGradWrapper); return c10::make_intrusive(key_set, tensor, level, life_handle); @@ -76,7 +81,12 @@ static Tensor unsafeMakeTensorWrapper( } auto keys_to_propagate = kKeysToPropagateToWrapper | DispatchKeySet({ +<<<<<<< HEAD DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA, 
DispatchKey::AutogradXLA}); +======= + DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA, DispatchKey::AutogradXLA, + DispatchKey::AutogradPrivateUse1}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto key_set = getKeysToPropagateToWrapper(tensor, keys_to_propagate); key_set = key_set.add(DispatchKey::FuncTorchGradWrapper); auto result = at::detail::make_tensor( diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index 9e714101d5a9..fff700505191 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -67,8 +67,13 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->setMemoryFraction(fraction, device); } +<<<<<<< HEAD void emptyCache() override { allocator_->emptyCache(); +======= + void emptyCache(MempoolId_t mempool_id = {0, 0}) override { + allocator_->emptyCache(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void enable(bool value) override { @@ -103,8 +108,13 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->resetPeakStats(device); } +<<<<<<< HEAD HIPCachingAllocator::SnapshotInfo snapshot() override { return allocator_->snapshot(); +======= + HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { + return allocator_->snapshot(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void beginAllocateToPool( @@ -128,10 +138,22 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo return allocator_->getPoolUseCount(device, mempool_id); } +<<<<<<< HEAD void ensureExistsAndIncrefPool( c10::DeviceIndex device, MempoolId_t mempool_id) override { allocator_->ensureExistsAndIncrefPool(device, mempool_id); +======= + void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPAllocator* allocator = nullptr) override { + allocator_->createOrIncrefPool(device, mempool_id, allocator); + } + + void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setUseOnOOM(device, mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool checkPoolLiveAllocations( @@ -157,8 +179,14 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo bool enabled, HIPCachingAllocator::CreateContextFn context_recorder, size_t alloc_trace_max_entries, +<<<<<<< HEAD HIPCachingAllocator::RecordContext when) override { allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when); +======= + HIPCachingAllocator::RecordContext when, + bool clearHistory) override { + allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void recordAnnotation( @@ -166,6 +194,17 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->recordAnnotation(md); } +<<<<<<< HEAD +======= + void pushCompileContext(std::string& md) override { + allocator_->pushCompileContext(md); + } + + void 
popCompileContext() override { + allocator_->popCompileContext(); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override { allocator_->attachOutOfMemoryObserver(observer); } diff --git a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h index 0edd69f08a91..d3ef24c3351f 100644 --- a/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -37,8 +37,13 @@ inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { return get()->setMemoryFraction(fraction, device); } +<<<<<<< HEAD inline void emptyCache() { return get()->emptyCache(); +======= +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + return get()->emptyCache(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void enable(bool value) { @@ -70,8 +75,13 @@ inline void resetPeakStats(c10::DeviceIndex device) { return get()->resetPeakStats(device); } +<<<<<<< HEAD inline HIPCachingAllocator::SnapshotInfo snapshot() { return get()->snapshot(); +======= +inline HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) { + return get()->snapshot(mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline std::shared_ptr getCheckpointState( @@ -101,9 +111,16 @@ inline void recordHistory( bool enabled, HIPCachingAllocator::CreateContextFn context_recorder, size_t alloc_trace_max_entries, +<<<<<<< HEAD HIPCachingAllocator::RecordContext when) { return get()->recordHistory( enabled, context_recorder, alloc_trace_max_entries, when); +======= + HIPCachingAllocator::RecordContext when, + bool clearHistory) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } inline void recordAnnotation( @@ -111,6 +128,17 @@ inline void recordAnnotation( return get()->recordAnnotation(md); } +<<<<<<< HEAD +======= +inline void pushCompileContext(std::string& md) { + return get()->pushCompileContext(md); +} + +inline void popCompileContext() { + return get()->popCompileContext(); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline bool isHistoryEnabled() { return get()->isHistoryEnabled(); } @@ -135,10 +163,22 @@ inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { return get()->releasePool(device, mempool_id); } +<<<<<<< HEAD inline void ensureExistsAndIncrefPool( c10::DeviceIndex device, MempoolId_t mempool_id) { get()->ensureExistsAndIncrefPool(device, mempool_id); +======= +inline void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) { + get()->createOrIncrefPool(device, mempool_id, allocator_ptr); +} + +inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setUseOnOOM(device, mempool_id); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } 
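// The conflicted allocator hunks above add pool-aware arguments (a MempoolId_t
// defaulting to {0, 0}) and new hooks, and the masquerading wrapper simply
// forwards every call to the wrapped HIP allocator with the same defaults.
// A minimal sketch of that forwarding pattern, using hypothetical stand-in
// types rather than the real c10/HIP classes:
#include <cstdint>
#include <utility>

using MempoolIdSketch = std::pair<uint64_t, uint64_t>;  // stand-in for MempoolId_t

struct AllocatorIfaceSketch {
  virtual ~AllocatorIfaceSketch() = default;
  virtual void emptyCache(MempoolIdSketch mempool_id = {0, 0}) = 0;
};

// Wrapper that "masquerades" as the base interface: every override delegates
// to the wrapped allocator and repeats the base default argument, so interface
// growth only requires mechanical forwarding updates in the wrapper.
struct ForwardingAllocatorSketch final : AllocatorIfaceSketch {
  explicit ForwardingAllocatorSketch(AllocatorIfaceSketch* inner) : inner_(inner) {}
  void emptyCache(MempoolIdSketch mempool_id = {0, 0}) override {
    inner_->emptyCache(mempool_id);  // pure delegation
  }
 private:
  AllocatorIfaceSketch* inner_;
};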
inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) { diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index fae1befb5d15..97492565ceab 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -67,7 +67,11 @@ struct DescriptorDeleter { // function. template // NOLINTNEXTLINE(bugprone-exception-escape) +<<<<<<< HEAD class TORCH_CUDA_CPP_API Descriptor { +======= +class TORCH_HIP_CPP_API Descriptor { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) public: // Use desc() to access the underlying descriptor pointer in // a read-only fashion. Most client code should use this. @@ -93,7 +97,11 @@ class TORCH_CUDA_CPP_API Descriptor { std::unique_ptr> desc_; }; +<<<<<<< HEAD class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor< +======= +class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor> { @@ -122,7 +130,11 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor< std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); +<<<<<<< HEAD class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor< +======= +class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor> { @@ -141,7 +153,11 @@ class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor< } }; +<<<<<<< HEAD struct TORCH_CUDA_CPP_API ConvolutionDescriptor +======= +struct TORCH_HIP_CPP_API ConvolutionDescriptor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : public Descriptor< miopenConvolutionDescriptor, &miopenCreateConvolutionDescriptor, @@ -156,11 +172,20 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor } }; +<<<<<<< HEAD struct DropoutDescriptor : public Descriptor { +======= +// NOLINTNEXTLINE(bugprone-exception-escape) +struct TORCH_HIP_CPP_API DropoutDescriptor + : public Descriptor< + miopenDropoutDescriptor, + &miopenCreateDropoutDescriptor, + &miopenDestroyDropoutDescriptor> { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set(miopenHandle_t handle, float dropout, void* states, size_t stateSizeInBytes, unsigned long long seed, bool use_mask, bool state_evo, miopenRNGType_t rng_mode) { MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, dropout, states, stateSizeInBytes, seed, use_mask, state_evo, rng_mode)); @@ -172,7 +197,11 @@ struct DropoutDescriptor } }; +<<<<<<< HEAD struct TORCH_CUDA_CPP_API RNNDescriptor +======= +struct TORCH_HIP_CPP_API RNNDescriptor +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) : public Descriptor diff --git a/aten/src/ATen/miopen/Handle.h b/aten/src/ATen/miopen/Handle.h index 2ec059c6f5f5..af852c2ae672 100644 --- a/aten/src/ATen/miopen/Handle.h +++ b/aten/src/ATen/miopen/Handle.h @@ -5,6 +5,10 @@ namespace at::native { +<<<<<<< HEAD TORCH_CUDA_CPP_API miopenHandle_t getMiopenHandle(); +======= +TORCH_HIP_CPP_API 
miopenHandle_t getMiopenHandle(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::native diff --git a/aten/src/ATen/miopen/Types.h b/aten/src/ATen/miopen/Types.h index 0a8a1a952e2e..75dfa3021060 100644 --- a/aten/src/ATen/miopen/Types.h +++ b/aten/src/ATen/miopen/Types.h @@ -6,7 +6,11 @@ namespace at::native { +<<<<<<< HEAD TORCH_CUDA_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); +======= +TORCH_HIP_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t miopen_version(); diff --git a/aten/src/ATen/mkl/Descriptors.h b/aten/src/ATen/mkl/Descriptors.h index 4a006639a7f7..0198bbea401e 100644 --- a/aten/src/ATen/mkl/Descriptors.h +++ b/aten/src/ATen/mkl/Descriptors.h @@ -17,9 +17,14 @@ struct DftiDescriptorDeleter { class DftiDescriptor { public: void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, MKL_LONG signal_ndim, MKL_LONG* sizes) { +<<<<<<< HEAD if (desc_ != nullptr) { throw std::runtime_error("DFTI DESCRIPTOR can only be initialized once"); } +======= + TORCH_CHECK( + desc_ == nullptr, "DFTI DESCRIPTOR can only be initialized once"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) DFTI_DESCRIPTOR *raw_desc; if (signal_ndim == 1) { MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, 1, sizes[0])); @@ -30,9 +35,14 @@ class DftiDescriptor { } DFTI_DESCRIPTOR *get() const { +<<<<<<< HEAD if (desc_ == nullptr) { throw std::runtime_error("DFTI DESCRIPTOR has not been initialized"); } +======= + TORCH_CHECK( + desc_ != nullptr, "DFTI DESCRIPTOR has not been initialized"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return desc_.get(); } diff --git a/aten/src/ATen/mkl/README.md b/aten/src/ATen/mkl/README.md index ee10392bdccd..919e59fa9945 100644 --- a/aten/src/ATen/mkl/README.md +++ b/aten/src/ATen/mkl/README.md @@ -1,4 +1,8 @@ All files living in this directory are written with the assumption that MKL is available, which means that these code are not guarded by `#if AT_MKL_ENABLED()`. Therefore, whenever you need to use definitions from here, please guard the `#include` and +<<<<<<< HEAD definition usages with `#if AT_MKL_ENABLED()` macro, e.g. [SpectralOps.cpp](native/mkl/SpectralOps.cpp). +======= +definition usages with `#if AT_MKL_ENABLED()` macro, e.g. [SpectralOps.cpp](../native/mkl/SpectralOps.cpp). +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index e6a292ba2a55..e320260b1918 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -1,5 +1,9 @@ // Copyright © 2022 Apple Inc. 
+<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -35,7 +39,11 @@ TensorBase empty_mps( layout_or_default(layout_opt) == Layout::Strided, "only strided tensors are supported on MPS"); +<<<<<<< HEAD TORCH_CHECK(size.size() <= 16, "MPS supports tensors with dimensions <= 16, but got ", size.size(), "."); +======= + TORCH_CHECK(size.size() <= c10::metal::max_ndim, "MPS supports tensors with dimensions <= ", c10::metal::max_ndim, ", but got ", size.size(), "."); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) check_size_nonnegative(size); diff --git a/aten/src/ATen/mps/MPSAllocator.h b/aten/src/ATen/mps/MPSAllocator.h index be17e364d58b..de80c5fc3560 100644 --- a/aten/src/ATen/mps/MPSAllocator.h +++ b/aten/src/ATen/mps/MPSAllocator.h @@ -345,7 +345,10 @@ class MPSHeapAllocatorImpl { return m_device; } +<<<<<<< HEAD // TODO: make a common function to do size unit conversions in PyTorch. +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) inline std::string format_size(uint64_t size) const; private: diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index cf0ebc869bb4..e4ec69a949bc 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -5,6 +5,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include @@ -21,19 +25,34 @@ init_buffer_pools(); // debug verbosity flags (see DebugVerbosity enum) +<<<<<<< HEAD static const char* verbosity_str = getenv("PYTORCH_DEBUG_MPS_ALLOCATOR"); m_debug_verbosity = verbosity_str ? strtol(verbosity_str, nullptr, 0) : DebugVerbosity::SILENT; static const char* high_watermark_ratio_str = getenv("PYTORCH_MPS_HIGH_WATERMARK_RATIO"); const double high_watermark_ratio = high_watermark_ratio_str ? strtod(high_watermark_ratio_str, nullptr) : default_high_watermark_ratio; +======= + static const auto verbosity_str = c10::utils::get_env("PYTORCH_DEBUG_MPS_ALLOCATOR"); + m_debug_verbosity = verbosity_str ? strtol(verbosity_str->c_str(), nullptr, 0) : DebugVerbosity::SILENT; + + static const auto high_watermark_ratio_str = c10::utils::get_env("PYTORCH_MPS_HIGH_WATERMARK_RATIO"); + const double high_watermark_ratio = + high_watermark_ratio_str ? strtod(high_watermark_ratio_str->c_str(), nullptr) : default_high_watermark_ratio; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) setHighWatermarkRatio(high_watermark_ratio); const double default_low_watermark_ratio = m_device.hasUnifiedMemory ? default_low_watermark_ratio_unified : default_low_watermark_ratio_discrete; +<<<<<<< HEAD static const char* low_watermark_ratio_str = getenv("PYTORCH_MPS_LOW_WATERMARK_RATIO"); const double low_watermark_ratio = low_watermark_ratio_str ? strtod(low_watermark_ratio_str, nullptr) : default_low_watermark_ratio; +======= + static const auto low_watermark_ratio_str = c10::utils::get_env("PYTORCH_MPS_LOW_WATERMARK_RATIO"); + const double low_watermark_ratio = + low_watermark_ratio_str ? 
strtod(low_watermark_ratio_str->c_str(), nullptr) : default_low_watermark_ratio; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) setLowWatermarkRatio(low_watermark_ratio); } @@ -638,7 +657,11 @@ std::lock_guard lock(m_mutex); BufferBlock* buffer_block = get_allocated_buffer_block(ptr); +<<<<<<< HEAD if (buffer_block && buffer_block->shape.size() > 0) { +======= + if (buffer_block && !buffer_block->shape.empty()) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return IntArrayRef{buffer_block->shape}; } return IntArrayRef(); @@ -699,6 +722,7 @@ } inline std::string MPSHeapAllocatorImpl::format_size(uint64_t size) const { +<<<<<<< HEAD std::ostringstream os; os.precision(2); os << std::fixed; @@ -712,6 +736,9 @@ os << ((float)size / 1073741824.0) << " GB"; } return os.str(); +======= + return c10::CachingAllocator::format_size(size); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace HeapAllocator diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h index 03637e7ca65f..ba1be6a97414 100644 --- a/aten/src/ATen/mps/MPSDevice.h +++ b/aten/src/ATen/mps/MPSDevice.h @@ -1,6 +1,10 @@ // Copyright © 2022 Apple Inc. #pragma once +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -70,4 +74,11 @@ TORCH_API bool is_available(); TORCH_API bool is_macos_13_or_newer(MacOSVersion version); TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); +<<<<<<< HEAD +======= +inline Device getDeviceFromPtr(void* ptr) { + return {c10::DeviceType::MPS, 0}; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::mps diff --git a/aten/src/ATen/mps/MPSFallback.mm b/aten/src/ATen/mps/MPSFallback.mm index a86c0f5feaa3..61f5caa5929a 100644 --- a/aten/src/ATen/mps/MPSFallback.mm +++ b/aten/src/ATen/mps/MPSFallback.mm @@ -2,6 +2,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include namespace at { @@ -76,8 +80,13 @@ static Tensor slow_conv2d_forward_mps(const Tensor& self, } TORCH_LIBRARY_IMPL(_, MPS, m) { +<<<<<<< HEAD static const char* enable_mps_fallback = getenv("PYTORCH_ENABLE_MPS_FALLBACK"); if (!enable_mps_fallback || std::stoi(enable_mps_fallback) == 0) { +======= + static const auto enable_mps_fallback = c10::utils::get_env("PYTORCH_ENABLE_MPS_FALLBACK"); + if (!enable_mps_fallback || enable_mps_fallback == "0") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) m.fallback(torch::CppFunction::makeFromBoxedFunction<&mps_error_fallback>()); } else { m.fallback(torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); @@ -91,9 +100,13 @@ static Tensor slow_conv2d_forward_mps(const Tensor& self, m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); +<<<<<<< HEAD m.impl("col2im", 
torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps); m.impl("upsample_nearest3d.vec", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); +======= + m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } // namespace at diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h index 7ff2d13ceefa..04dc816a8b7a 100644 --- a/aten/src/ATen/mps/MPSGuardImpl.h +++ b/aten/src/ATen/mps/MPSGuardImpl.h @@ -36,7 +36,14 @@ struct TORCH_API MPSGuardImpl final // constructor MPSGuardImpl() {} explicit MPSGuardImpl(c10::DeviceType t) { +<<<<<<< HEAD TORCH_INTERNAL_ASSERT(t == c10::DeviceType::MPS); +======= + TORCH_CHECK( + t == DeviceType::MPS, + "MPSGuardImpl initialized with non-MPS DeviceType: ", + t); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // returns the type @@ -57,7 +64,11 @@ struct TORCH_API MPSGuardImpl final } void setDevice(Device d) const override { +<<<<<<< HEAD TORCH_INTERNAL_ASSERT(d.is_mps()); +======= + TORCH_CHECK(d.is_mps(), "Expected a MPS device, but got ", d); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void uncheckedSetDevice(Device d) const noexcept override { diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h index 17a3d3a68cec..3345b119849e 100644 --- a/aten/src/ATen/mps/MPSHooks.h +++ b/aten/src/ATen/mps/MPSHooks.h @@ -18,6 +18,11 @@ struct MPSHooks : public at::MPSHooksInterface { bool hasMPS() const override; bool isOnMacOSorNewer(unsigned major, unsigned minor) const override; +<<<<<<< HEAD +======= + Device getDeviceFromPtr(void* data) const override; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // MPSGeneratorImpl interface const Generator& getDefaultGenerator( DeviceIndex device_index = -1) const override; diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index 03c39c957368..9ec7f8b4d468 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -129,6 +129,13 @@ at::mps::getMPSEventPool()->recordEvent(event_id, /* syncEvent*/ true); } +<<<<<<< HEAD +======= +Device MPSHooks::getDeviceFromPtr(void* data) const { + return at::mps::getDeviceFromPtr(data); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void MPSHooks::waitForEvent(uint32_t event_id) const { at::mps::getMPSEventPool()->waitForEvent(event_id, /* syncEvent*/ true); } diff --git a/aten/src/ATen/mps/MPSProfiler.mm b/aten/src/ATen/mps/MPSProfiler.mm index 6adce7d382a6..3037098ff119 100644 --- a/aten/src/ATen/mps/MPSProfiler.mm +++ b/aten/src/ATen/mps/MPSProfiler.mm @@ -2,6 +2,10 @@ #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include // these need to be literal strings when passed to os_signpost*() @@ -91,11 +95,19 @@ MPSProfiler::MPSProfiler() : m_os_log_events(nullptr), m_os_log_intervals(nullptr) { // see enum LogOptions for the description. 
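// The MPS hunks above and below migrate from raw getenv()/strtol() parsing to
// c10::utils::get_env(), which the new code treats as an optional string.
// A small self-contained sketch of that pattern; get_env_sketch and
// env_flag_or are stand-in helpers for illustration, not the c10 API:
#include <cstdlib>
#include <optional>
#include <string>

static std::optional<std::string> get_env_sketch(const char* name) {
  const char* raw = std::getenv(name);
  if (raw == nullptr) {
    return std::nullopt;
  }
  return std::string(raw);
}

static long env_flag_or(const char* name, long fallback) {
  const auto value = get_env_sketch(name);
  // Same shape as the profiler/allocator code: parse only when the variable
  // is set, otherwise fall back to the default.
  return value ? std::strtol(value->c_str(), nullptr, 0) : fallback;
}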
+<<<<<<< HEAD static const char* log_options_str = getenv(kEVLogProfileInfoStr); m_log_options = log_options_str ? strtol(log_options_str, nullptr, 0) : 0; // see enums profilerOptions and SignpostTypes for the description. static const char* trace_signpost_str = getenv(kEVTraceSignpostsStr); uint32_t trace_signposts = trace_signpost_str ? strtol(trace_signpost_str, nullptr, 0) : 0; +======= + static const auto log_options_str = c10::utils::get_env(kEVLogProfileInfoStr); + m_log_options = log_options_str ? strtol(log_options_str->c_str(), nullptr, 0) : 0; + // see enums profilerOptions and SignpostTypes for the description. + static const auto trace_signpost_str = c10::utils::get_env(kEVTraceSignpostsStr); + uint32_t trace_signposts = trace_signpost_str ? strtol(trace_signpost_str->c_str(), nullptr, 0) : 0; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(m_log_options <= LogOptions::LOG_COUNT, "invalid log options ", @@ -779,8 +791,13 @@ } // used to capture sigint signal to log profiling stats +<<<<<<< HEAD struct sigaction MPSProfiler::currentSigint {}; struct sigaction MPSProfiler::previousSigint {}; +======= +struct sigaction MPSProfiler::currentSigint{}; +struct sigaction MPSProfiler::previousSigint{}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) bool MPSProfiler::isCapturing() const { return [captureManager isCapturing]; diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 8b1182982002..d8043b4664a9 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -63,6 +63,7 @@ namespace { const Tensor& grad_output, const Tensor& input) { +<<<<<<< HEAD int64_t ndim = grad_output.ndimension(); for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(grad_output.size(i) > 0, @@ -77,6 +78,18 @@ namespace { "expected dtype ", input.dtype(), " for `grad_output` but got dtype ", grad_output.dtype()); TORCH_CHECK(input.dtype() == grad_input.dtype(), "expected dtype ", input.dtype(), " for `grad_input` but got dtype ", grad_input.dtype()); +======= + adaptive_pool_empty_output_check(grad_output, "adaptive_avg_pool2d_backward"); + int64_t ndim = grad_output.dim(); + TORCH_CHECK(input.dim() == ndim, + __func__, ": Expected dimensions ", input.dim(), " for `grad_output` but got dimensions ", ndim); + TORCH_CHECK((ndim == 3 || ndim == 4), + __func__, ": Expected 3D or 4D tensor, but got ", input.sizes()); + TORCH_CHECK(input.dtype() == grad_output.dtype(), + __func__, ": Expected dtype ", input.dtype(), " for `grad_output` but got dtype ", grad_output.dtype()); + TORCH_CHECK(input.dtype() == grad_input.dtype(), + __func__, ": Expected dtype ", input.dtype(), " for `grad_input` but got dtype ", grad_input.dtype()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) grad_input.resize_(input.sizes(), input.suggest_memory_format()); grad_input.zero_(); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index 4897864a378b..4073d60e8fe3 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -235,6 +235,11 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template( auto gradOutput = gradOutput_.contiguous(); 
adaptive_pool_empty_output_check(gradOutput_, "adaptive_avg_pool3d_backward"); +<<<<<<< HEAD +======= + TORCH_CHECK(input.dim() == gradOutput_.dim(), + __func__, ": Expected dimensions ", input.dim(), " for `gradOutput_` but got dimensions ", gradOutput_.dim()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* sizes */ int64_t sizeD = input.size(-4); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 897e83890c79..a126572d5c19 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -125,11 +125,19 @@ // linear algebra function uses that routine #if AT_BUILD_WITH_LAPACK() +<<<<<<< HEAD +======= +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // getrf extern "C" void zgetrf_(int *m, int *n, std::complex *a, int *lda, int *ipiv, int *info); extern "C" void cgetrf_(int *m, int *n, std::complex *a, int *lda, int *ipiv, int *info); extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // potrs #if defined(_WIN32) && defined(_M_ARM64) @@ -165,13 +173,25 @@ static inline void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, fl #else +<<<<<<< HEAD +======= +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, int *lda, std::complex *b, int *ldb, int *info); extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, int *lda, std::complex *b, int *ldb, int *info); extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); +<<<<<<< HEAD + +#endif +======= #endif +#endif + +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // potrf extern "C" void zpotrf_(char *uplo, int *n, std::complex *a, int *lda, int *info); extern "C" void cpotrf_(char *uplo, int *n, std::complex *a, int *lda, int *info); @@ -317,6 +337,10 @@ extern "C" void zungqr_(int *m, int *n, int *k, std::complex *a, int *ld extern "C" void cungqr_(int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *work, int *lwork, int *info); extern "C" void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info); extern "C" void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // ormqr #if defined(_WIN32) && defined(_M_ARM64) @@ -347,11 +371,20 @@ static inline void sormqr_(char *side, char *trans, int *m, int *n, int *k, floa *info = LAPACKE_sormqr_work(LAPACK_COL_MAJOR, *side, *trans, *m, *n, *k, a, *lda, tau, c, *ldc, work, *lwork); } #else +<<<<<<< HEAD +======= +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" void zunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *c, int *ldc, std::complex *work, int *lwork, int *info); extern "C" void cunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *c, int *ldc, std::complex *work, int *lwork, int *info); extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info); extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info); #endif +<<<<<<< HEAD +======= +#endif +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // syevd extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex *a, int *lda, double *w, std::complex *work, int *lwork, double *rwork, int *lrwork, int *iwork, int *liwork, int *info); extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex *a, int *lda, float *w, std::complex *work, int *lwork, float *rwork, int *lrwork, int *iwork, int *liwork, int *info); @@ -466,14 +499,26 @@ extern "C" void sgelss_(int *m, int *n, int *nrhs, float *s, float *rcond, int *rank, float *work, int *lwork, int *info); #endif +<<<<<<< HEAD #if AT_BUILD_WITH_BLAS() // trsm +======= +#endif + +#if AT_BUILD_WITH_BLAS() +// trsm +#ifndef _ARMPL_H // ArmPL's `cblas.h` pulls in these prototypes. 
+>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" void ztrsm_(char *side, char *uplo, char *trans, char *diag, int *n, int *nrhs, std::complex *alpha, std::complex *a, int *lda, std::complex *b, int *ldb); extern "C" void ctrsm_(char *side, char *uplo, char *trans, char *diag, int *n, int *nrhs, std::complex *alpha, std::complex *a, int *lda, std::complex *b, int *ldb); extern "C" void dtrsm_(char *side, char *uplo, char *trans, char *diag, int *n, int *nrhs, double *alpha, double *a, int *lda, double *b, int *ldb); extern "C" void strsm_(char *side, char *uplo, char *trans, char *diag, int *n, int *nrhs, float *alpha, float *a, int *lda, float *b, int *ldb); #endif +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::meta { @@ -685,7 +730,11 @@ TORCH_META_FUNC(linalg_cholesky_ex)(const Tensor& A, auto ndim = A_shape.size(); // L +<<<<<<< HEAD auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/A.device().type() != at::kMPS); +======= + auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/true); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) set_output_strided(0, A_shape, L_strides, A.options(), {}); // info @@ -849,10 +898,14 @@ namespace at::native { // linear algebra operations template +<<<<<<< HEAD void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info); template void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info); +======= +static void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template<> void lapackLu>(int m, int n, c10::complex *a, int lda, int *ipiv, int *info) { zgetrf_(&m, &n, reinterpret_cast*>(a), &lda, ipiv, info); @@ -2694,20 +2747,30 @@ Tensor& ormqr_out(const Tensor& input, const Tensor& tau, const Tensor& other, b int64_t left_size_condition = left ? 
-2 : -1; TORCH_CHECK( +<<<<<<< HEAD other.size(left_size_condition) >= tau.size(-1), "torch.ormqr: other.shape[", left_size_condition, "] must be greater than or equal to tau.shape[-1]"); TORCH_CHECK( +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) other.size(left_size_condition) == input.size(-2), "torch.ormqr: other.shape[", left_size_condition, "] must be equal to input.shape[-2]"); TORCH_CHECK( +<<<<<<< HEAD tau.size(-1) <= input.size(-1), "torch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]"); +======= + std::min(other.size(left_size_condition), input.size(-1)) == tau.size(-1), + "torch.ormqr: tau.shape[-1] must be equal to min(other.shape[", + left_size_condition, + "], input.shape[-1])"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( input.dim() - tau.dim() == 1, @@ -2716,6 +2779,10 @@ Tensor& ormqr_out(const Tensor& input, const Tensor& tau, const Tensor& other, b tau.dim(), " and input.ndim is equal to ", input.dim()); +<<<<<<< HEAD +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( input.dim() == other.dim(), "torch.ormqr: ", diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 8dce552b0e13..8b83b8576431 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -137,6 +137,28 @@ Tensor& cholesky_inverse_kernel_impl(Tensor& result, Tensor& infos, bool upper) } /* +<<<<<<< HEAD +======= + LAPACK query functions return workspace size as floating point value, which means + that it might not be accurately represented if it's size exceed mantissa of the + corresponding type. Fix it by adding 1ULP to the value before casting to it + For more info see https://github.com/pytorch/pytorch/issues/145801#issuecomment-2631781776 +*/ +template +static inline +std::enable_if_t, int> lapack_work_to_int(const T val) { + const auto next_after = std::nextafter(val, std::numeric_limits::infinity()); + return std::max(1, std::ceil(next_after)); +} +template +static inline +std::enable_if_t::value, int> lapack_work_to_int(const T val) { + return lapack_work_to_int(val.real()); +} + + +/* +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Computes the eigenvalues and eigenvectors of n-by-n matrix 'input'. This is an in-place routine, content of 'input', 'values', 'vectors' is overwritten. 'infos' is an int Tensor containing error codes for each matrix in the batched input. 
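// The lapack_work_to_int() helper introduced above rounds a floating-point
// workspace query up by one ULP before converting to int, so a size that the
// float/double query result cannot represent exactly is never truncated to a
// too-small allocation. A minimal single-type sketch of the same idea (the
// real helper is templated and also handles complex-valued query results):
#include <algorithm>
#include <cmath>
#include <limits>

static int work_query_to_int_sketch(float wkopt) {
  // Bump to the next representable value toward +inf, then round up and
  // clamp to at least 1, matching the behaviour described in the comment.
  const float bumped = std::nextafter(wkopt, std::numeric_limits<float>::infinity());
  return std::max(1, static_cast<int>(std::ceil(bumped)));
}
// e.g. a true workspace of 16777217 elements cannot be represented exactly in
// a float query result and may come back as 16777216.0f; bumping by one ULP
// before ceil() yields 16777218, so the allocation never falls short.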
@@ -178,7 +200,11 @@ void apply_linalg_eig(Tensor& values, Tensor& vectors, Tensor& input, Tensor& in lapackEig(jobvl, jobvr, n, input_data, lda, values_data, lvectors_data, ldvl, rvectors_data, ldvr, &work_query, -1, rwork_data, &infos_data[0]); +<<<<<<< HEAD int lwork = std::max(1, static_cast(real_impl(work_query))); +======= + int lwork = lapack_work_to_int(work_query); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, input.dtype()); auto work_data = work.mutable_data_ptr(); @@ -218,6 +244,11 @@ void linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos, 'compute_eigenvectors' controls whether eigenvectors should be computed. This function doesn't do any error checks and it's assumed that every argument is valid. */ +<<<<<<< HEAD +======= + + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void apply_lapack_eigh(const Tensor& values, const Tensor& vectors, const Tensor& infos, bool upper, bool compute_eigenvectors) { #if !AT_BUILD_WITH_LAPACK() @@ -256,8 +287,12 @@ void apply_lapack_eigh(const Tensor& values, const Tensor& vectors, const Tensor lapackSyevd(jobz, uplo, n, vectors_data, lda, values_data, &lwork_query, lwork, &rwork_query, lrwork, &iwork_query, liwork, infos_data); +<<<<<<< HEAD value_t next_after_lw = std::nextafter(real_impl(lwork_query), std::numeric_limits::infinity()); lwork = std::max(1, std::ceil(next_after_lw)); +======= + lwork = lapack_work_to_int(lwork_query); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, vectors.options()); auto work_data = work.mutable_data_ptr(); @@ -269,8 +304,12 @@ void apply_lapack_eigh(const Tensor& values, const Tensor& vectors, const Tensor Tensor rwork; value_t* rwork_data = nullptr; if (vectors.is_complex()) { +<<<<<<< HEAD value_t next_after_rwork_query = std::nextafter(rwork_query, std::numeric_limits::infinity()); lrwork = std::max(1, std::ceil(next_after_rwork_query)); +======= + lrwork = lapack_work_to_int(rwork_query); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) rwork = at::empty({lrwork}, values.options()); rwork_data = rwork.mutable_data_ptr(); } @@ -331,7 +370,10 @@ static void apply_geqrf(const Tensor& input, const Tensor& tau) { "Calling torch.geqrf on a CPU tensor requires compiling ", "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); #else +<<<<<<< HEAD using value_t = typename c10::scalar_value_type::type; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto input_data = input.data_ptr(); auto tau_data = tau.data_ptr(); auto input_matrix_stride = matrixStride(input); @@ -353,7 +395,11 @@ static void apply_geqrf(const Tensor& input, const Tensor& tau) { // if lwork is less than 'n' then a warning is printed: // Intel MKL ERROR: Parameter 7 was incorrect on entry to SGEQRF. 
+<<<<<<< HEAD lwork = std::max({1, static_cast(n), static_cast(real_impl(wkopt))}); +======= + lwork = std::max(static_cast(n), lapack_work_to_int(wkopt)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, input.options()); for (const auto i : c10::irange(batch_size)) { @@ -401,7 +447,10 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { return; } +<<<<<<< HEAD using value_t = typename c10::scalar_value_type::type; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto self_data = self.data_ptr(); auto tau_data = tau.const_data_ptr(); auto self_matrix_stride = matrixStride(self); @@ -425,7 +474,11 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { scalar_t wkopt; lapackOrgqr(m, n, k, self_data, lda, const_cast(tau_data), &wkopt, lwork, &info); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0); +<<<<<<< HEAD lwork = std::max(1, real_impl(wkopt)); +======= + lwork = lapack_work_to_int(wkopt); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, self.options()); for (const auto i : c10::irange(batch_size)) { @@ -544,7 +597,11 @@ void apply_lstsq(const Tensor& A, Tensor& B, Tensor& rank, Tensor& singular_valu s_working_ptr, &iwork_opt); +<<<<<<< HEAD lwork = std::max(1, real_impl(work_opt)); +======= + lwork = lapack_work_to_int(work_opt); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor work = at::empty({lwork}, A.options()); scalar_t* work_data = work.mutable_data_ptr(); @@ -1066,7 +1123,11 @@ static void apply_svd(const Tensor& A, { scalar_t wkopt; lapackSvd(jobz, m, n, A_data, lda, S_data, U_data, ldu, Vh_data, ldvh, &wkopt, lwork, rwork_data, iwork_data, info_data); +<<<<<<< HEAD lwork = std::max(1, real_impl(wkopt)); +======= + lwork = lapack_work_to_int(wkopt); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } auto work = std::vector(lwork); auto* const work_data = work.data(); diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 7e4e77a67a8a..2f48e1e34ada 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -1383,35 +1383,59 @@ Tensor bitwise_right_shift(const Scalar& self, const Tensor& other) { } template +<<<<<<< HEAD Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& other, Stub& stub) { +======= +static Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& other, Stub& stub) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto iter = TensorIterator::comparison_op(result, self, other); stub(iter.device_type(), iter); return result; } template +<<<<<<< HEAD Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) { +======= +static Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor result = at::empty({0}, self.options().dtype(kBool)); return out_impl(result, self, other); } template +<<<<<<< HEAD Tensor& 
comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) { +======= +static Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return out_impl(self, self, other); } template +<<<<<<< HEAD Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Scalar& other, OutImpl& out_impl) { +======= +static Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Scalar& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return out_impl(result, self, wrapped_scalar_tensor(other)); } template +<<<<<<< HEAD Tensor comparison_op(const Tensor& self, const Scalar& other, OutImpl& out_impl) { +======= +static Tensor comparison_op(const Tensor& self, const Scalar& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return comparison_op(self, wrapped_scalar_tensor(other), out_impl); } template +<<<<<<< HEAD Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) { +======= +static Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return out_impl(self, self, wrapped_scalar_tensor(other)); } diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index f62c31777822..3c40eaefe5ff 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -7,6 +7,14 @@ #include #include +<<<<<<< HEAD +======= +#include +#include +#if !defined(__s390x__) && !defined(__powerpc__) +#include +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include @@ -24,6 +32,12 @@ #include #include #include +<<<<<<< HEAD +======= +#include +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif namespace at::meta { @@ -222,4 +236,109 @@ Tensor vdot(const Tensor &self, const Tensor &other){ } +<<<<<<< HEAD +======= +static Tensor& +_scaled_mm_out_cpu_emulated(const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + + TORCH_INTERNAL_ASSERT((scale_a.numel() == 1 && scale_b.numel() == 1), "Now _scaled_mm only supports per-tensor scaling for CPU backend."); + TORCH_CHECK( + !scale_result || + (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat), + "scale_result must be a float scalar"); + TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1], + " but got ", bias->numel()); + + // Check types + TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type"); + 
TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type()); + TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type()); + + auto mat1_c = mat1.contiguous(); + auto mat2_c = mat2.contiguous(); + IntArrayRef mat1_sizes = mat1_c.sizes(); + IntArrayRef mat2_sizes = mat2_c.sizes(); + at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); + + float input_scale = scale_a.item(); + float weight_scale = scale_b.item(); + float output_scale = float(1.0); + if (scale_result.has_value() && + (*out_dtype == ScalarType::Float8_e4m3fn || + *out_dtype == ScalarType::Float8_e5m2)) { + output_scale = scale_result.value().item(); + } + auto fp32_mat1 = at::mul(mat1.to(kFloat), input_scale); + auto fp32_mat2 = at::mul(mat2_c.to(kFloat), weight_scale); + auto out_tmp = at::matmul(fp32_mat1, fp32_mat2); + if (bias) { + out_tmp.add_(bias.value()); + } + if (*out_dtype == ScalarType::Float8_e4m3fn || + *out_dtype == ScalarType::Float8_e5m2) { + out_tmp = at::mul(out_tmp, 1 / output_scale); + } + out_tmp = out_tmp.to(out.scalar_type()); + out.copy_(out_tmp); + return out; +} + +Tensor& +_scaled_mm_out_cpu(const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum, + Tensor& out) { +#if AT_MKLDNN_ENABLED() && !defined(__powerpc__) + if (at::globalContext().userEnabledMkldnn()) { + bool mixed_dtype = mat1.scalar_type() != mat2.scalar_type(); + if ((!mixed_dtype && cpuinfo_has_x86_amx_int8()) || + (mixed_dtype && cpuinfo_has_x86_amx_fp16())) { + return mkldnn_scaled_mm( + mat1, + mat2, + scale_a, + scale_b, + bias, + scale_result, + out_dtype, + use_fast_accum, + out); + } + } +#endif + { + return _scaled_mm_out_cpu_emulated(mat1, mat2, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); + } +} + +Tensor +_scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::native diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index 58cc456254d8..3f7171a8fadd 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -33,11 +33,19 @@ T* remove_const(const T* x) { } // namespace #if AT_BUILD_WITH_BLAS() +<<<<<<< HEAD +======= +#ifndef _ARMPL_H +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy); extern "C" void dscal_(int *n, double *a, double *x, int *incx); extern "C" void sscal_(int *n, float *a, float *x, int *incx); extern "C" void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); extern "C" void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, 
float *y, int *incy); +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_BLAS_F2C() # define ffloat double @@ -52,10 +60,18 @@ extern "C" void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int extern "C" void cblas_cdotc_sub(const int n, const void *x, const int incx, const void *y, const int incy, void *dotc); extern "C" void cblas_zdotc_sub(const int n, const void *x, const int incx, const void *y, const int incy, void *dotc); +<<<<<<< HEAD +======= +#ifndef _ARMPL_H +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline ffloat sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy) { return cblas_sdot(*n, x, *incx, y, *incy); } +<<<<<<< HEAD +======= +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static inline void cdotu_(std::complex *res, const int *n, const std::complex *x, const int *incx, const std::complex *y, const int *incy) { cblas_cdotu_sub(*n, x, *incx, y, *incy, res); @@ -86,6 +102,11 @@ namespace at::native { #if !defined(C10_MOBILE) DEFINE_DISPATCH(fp16_gemv_trans_stub); DEFINE_DISPATCH(bf16_gemv_trans_stub); +<<<<<<< HEAD +======= +DEFINE_DISPATCH(fp16_dot_stub); +DEFINE_DISPATCH(bf16_dot_stub); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // !defined(C10_MOBILE) namespace blas_impl { @@ -116,6 +137,7 @@ void fp16_gemv_trans( fp16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); } +<<<<<<< HEAD void bf16_gemv_trans( const int m, const int n, @@ -127,10 +149,67 @@ void bf16_gemv_trans( const at::BFloat16 beta, at::BFloat16* y, const int incy); +======= +static float fp16_dot( + const int64_t n, + const Half* x, + const int64_t incx, + const Half* y, + const int64_t incy) { + return fp16_dot_stub(kCPU, n, x, incx, y, incy); +} + +static float bf16_dot( + const int64_t n, + const BFloat16* x, + const int64_t incx, + const BFloat16* y, + const int64_t incy) { + return bf16_dot_stub(kCPU, n, x, incx, y, incy); +} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif // !defined(C10_MOBILE) #if defined(__aarch64__) && !defined(C10_MOBILE) +<<<<<<< HEAD +======= +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC +static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { + for (auto j = 0; j < n; j++) { + auto vecCol = vdup_n_f16(x[j]); + const auto* column = a + lda * j; + for (auto i = 0; i < m; i += 4) { + auto yf16 = y + i; + auto matRow = vld1_f16(column + i); + auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0); + resVec = vfma_lane_f16(resVec, matRow, vecCol, 0); + vst1_f16(yf16, resVec); + } + } +} +#endif + +static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { + std::vector sum(m); + for (auto j = 0; j < n; j++) { + auto vecCol = vdup_n_f32(x[j]); + const auto* column = a + lda * j; + for (auto i = 0; i < m; i += 4) { + auto sf32 = sum.data() + i; + auto matRow = vcvt_f32_f16(vld1_f16(column + i)); + auto resVec = j != 0 ? 
vld1q_f32(sf32) : vdupq_n_f32(0); + resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0); + vst1q_f32(sf32, resVec); + } + } + + for (auto i = 0; i < m; i+= 4) { + vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i))); + } +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void fp16_gemv_notrans( const int m, const int n, @@ -143,17 +222,66 @@ void fp16_gemv_notrans( Half* y, const int incy); +<<<<<<< HEAD #endif // defined(__aarch64__) && !defined(C10_MOBILE) template bool scal_use_fast_path( +======= +void fp16_gemv_notrans( + const int m, + const int n, + const float alpha, + const Half* a, + const int lda, + const Half* x, + const int incx, + const float beta, + Half* y, + const int incy) { + if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) { +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC + if (at::globalContext().allowFP16ReductionCPU()) { + return fp16_gemv_notrans_fp16_arith(m, n, reinterpret_cast(a), lda, reinterpret_cast(x), reinterpret_cast(y)); + } +#endif + return fp16_gemv_notrans_fp32_arith(m, n, reinterpret_cast(a), lda, reinterpret_cast(x), reinterpret_cast(y)); + } + std::vector sum(m); + for (const auto j : c10::irange(n)) { + const auto* column_ = a + lda * j; + auto z = alpha * x[j * incx]; + for (const auto i : c10::irange(m)) { + sum[i] += z * column_[i]; + } + } + if (beta == 0.0) { + for (const auto i : c10::irange(m)) { + y[i * incy] = sum[i]; + } + } else { + for (const auto i : c10::irange(m)) { + y[i * incy] += sum[i]; + } + } +} + +#endif // defined(__aarch64__) && !defined(C10_MOBILE) + +template +static bool scal_use_fast_path( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[maybe_unused]] int64_t n, [[maybe_unused]] int64_t incx) { return false; } template +<<<<<<< HEAD bool gemv_use_fast_path( +======= +static bool gemv_use_fast_path( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[maybe_unused]] char trans, [[maybe_unused]] int64_t m, [[maybe_unused]] int64_t n, @@ -166,7 +294,11 @@ bool gemv_use_fast_path( } template +<<<<<<< HEAD void scal_fast_path( +======= +static void scal_fast_path( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[maybe_unused]] int* n, [[maybe_unused]] scalar_t* a, [[maybe_unused]] scalar_t* x, @@ -176,7 +308,11 @@ void scal_fast_path( } template +<<<<<<< HEAD void gemv_fast_path( +======= +static void gemv_fast_path( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) [[maybe_unused]] const char* trans, [[maybe_unused]] const int* m, [[maybe_unused]] const int* n, @@ -258,10 +394,13 @@ template <> void gemv_fast_path(const char *trans, const int *m, const int *n, const float *alpha, const float *a, const int *lda, const float *x, const int *incx, const float *beta, float *y, const int *incy) { sgemv_(remove_const(trans), remove_const(m), remove_const(n), remove_const(alpha), remove_const(a), remove_const(lda), remove_const(x), remove_const(incx), remove_const(beta), y, remove_const(incy)); } +<<<<<<< HEAD #else INSTANTIATE(float) INSTANTIATE(double) #endif // AT_BUILD_WITH_BLAS +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) 
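// fp16_gemv_notrans_fp32_arith() above converts each float16 column chunk to
// float32 and accumulates in a float32 buffer, converting back to float16
// only once per output element; the scalar fallback and the Half/BFloat16
// dot paths that follow use the same widen-then-accumulate idea through an
// opmath-style type. A scalar sketch of that reduction pattern, with generic
// storage/accumulator types standing in for at::Half and float:
#include <cstddef>

template <typename StorageT, typename AccT>
static AccT dot_widened_sketch(const StorageT* x, const StorageT* y, std::size_t n) {
  AccT sum = AccT(0);
  for (std::size_t i = 0; i < n; ++i) {
    // Widen each operand before the multiply-add so rounding error does not
    // compound in the low-precision storage type.
    sum += static_cast<AccT>(x[i]) * static_cast<AccT>(y[i]);
  }
  return sum;
}
// usage sketch: float r = dot_widened_sketch<half_type, float>(a, b, n);
// where half_type is any 16-bit float type convertible to float.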
INSTANTIATE(uint8_t) INSTANTIATE(int8_t) @@ -283,7 +422,11 @@ bool gemv_use_fast_path( beta == 0.0; } +<<<<<<< HEAD void bf16_gemv_trans( +======= +static void bf16_gemv_trans( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const int m, const int n, const at::BFloat16 alpha, @@ -368,6 +511,7 @@ void gemv_fast_path( y, *incy); } +<<<<<<< HEAD #else template <> bool scal_use_fast_path( @@ -376,6 +520,9 @@ bool scal_use_fast_path( return false; } +======= +#else // !defined(__aarch64__)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> bool gemv_use_fast_path( char trans, @@ -391,6 +538,7 @@ bool gemv_use_fast_path( (c10::detail::fp16_from_bits(beta.x) == 0.0f || trans == 't' || trans == 'T'); } +<<<<<<< HEAD #ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { for (auto j = 0; j < n; j++) { @@ -464,6 +612,8 @@ void fp16_gemv_notrans( } } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template <> void gemv_fast_path( const char* trans, @@ -511,6 +661,10 @@ void gemv_fast_path( INSTANTIATE(c10::Half) INSTANTIATE(c10::BFloat16) #endif // !defined(C10_MOBILE) +<<<<<<< HEAD +======= +#endif // AT_BUILD_WITH_BLAS +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #undef INSTANTIATE } // namespace blas_impl @@ -559,7 +713,11 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, const scalar_t *a, i opmath_t sum = 0; const scalar_t *row_ = a + lda * i; for (const auto j : c10::irange(m)) { +<<<<<<< HEAD sum += x[j * incx] * row_[j]; +======= + sum += static_cast(x[j * incx]) * static_cast(row_[j]); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (beta == scalar_t(0)) { y[i * incy] = alpha * sum; @@ -690,7 +848,11 @@ scalar_t dot_impl(int64_t n, const scalar_t* x, int64_t incx, const scalar_t* y, incx = 1; incy = 1; } +<<<<<<< HEAD return blas_impl::dot_naive(n, x, incx, y, incy, std::multiplies{}); +======= + return blas_impl::dot_naive(n, x, incx, y, incy, std::multiplies>{}); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } template <> @@ -713,6 +875,37 @@ c10::complex dot_impl(int64_t n, const c10::complex* x, int64_t in return dot_impl_floating(n, x, incx, y, incy); } +<<<<<<< HEAD +======= +template <> +Half dot_impl(int64_t n, const Half* x, int64_t incx, const Half* y, int64_t incy) { + if (n == 1) { + incx = 1; + incy = 1; + } +#if !defined(C10_MOBILE) + if (incx == 1 && incy == 1) { + return blas_impl::fp16_dot(n, x, incx, y, incy); + } +#endif // !defined(C10_MOBILE) + return blas_impl::dot_naive(n, x, incx, y, incy, std::multiplies{}); +} + +template <> +BFloat16 dot_impl(int64_t n, const BFloat16* x, int64_t incx, const BFloat16* y, int64_t incy) { + if (n == 1) { + incx = 1; + incy = 1; + } +#if !defined(C10_MOBILE) + if (incx == 1 && incy == 1) { + return blas_impl::bf16_dot(n, x, incx, y, incy); + } +#endif // !defined(C10_MOBILE) + return blas_impl::dot_naive(n, x, incx, y, incy, std::multiplies{}); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise 
broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { template struct vdot_op { @@ -739,7 +932,11 @@ scalar_t vdot_impl(int64_t n, const scalar_t* x, int64_t incx, const scalar_t* y #endif } +<<<<<<< HEAD // Skip reinstantiating the explicitly specialized types `float` and `double`. +======= +// Skip reinstantiating the explicitly specialized types `float`, `double`, `half` & `bfloat16`. +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define INSTANTIATE_DOT_IMPL(scalar_t) \ template scalar_t dot_impl( \ int64_t n, const scalar_t * x, int64_t incx, const scalar_t * y, int64_t incy); @@ -748,8 +945,11 @@ INSTANTIATE_DOT_IMPL(int8_t) INSTANTIATE_DOT_IMPL(int16_t) INSTANTIATE_DOT_IMPL(int) INSTANTIATE_DOT_IMPL(int64_t) +<<<<<<< HEAD INSTANTIATE_DOT_IMPL(c10::Half) INSTANTIATE_DOT_IMPL(c10::BFloat16) +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define INSTANTIATE_VDOT_IMPL(scalar_t) \ template scalar_t vdot_impl( \ diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index fb401f076797..72c9ba221abb 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -15,7 +15,11 @@ #if AT_BUILD_WITH_BLAS() #if C10_IOS #include +<<<<<<< HEAD #else +======= +#elif !defined(_ARMPL_H) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) extern "C" void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, const double *a, int *lda, const double *b, int *ldb, double *beta, double *c, int *ldc); extern "C" void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, const float *a, int *lda, const float *b, int *ldb, float *beta, float *c, int *ldc); extern "C" void cgemm_(char *transa, char *transb, int *m, int *n, int *k, void *alpha, const void *a, int *lda, const void *b, int *ldb, void *beta, void *c, int *ldc); @@ -135,6 +139,10 @@ CBLAS_TRANSPOSE to_apple_accelerate_transpose(TransposeType trans) { } // namespace (anonymous) DEFINE_DISPATCH(gemm_stub); +<<<<<<< HEAD +======= +DEFINE_DISPATCH(gemm_no_downcast_stub); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void gemm( TransposeType transa, TransposeType transb, @@ -179,6 +187,21 @@ void gemm( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } +<<<<<<< HEAD +======= +#ifndef armpl_doublecomplex_t +#define COMPLEX_DBL(a) a +#define COMPLEX_DBL_CONST(a) a +#define COMPLEX_FLOAT(a) a +#define COMPLEX_FLOAT_CONST(a) a +#else +#define COMPLEX_DBL(a) ((armpl_doublecomplex_t*)a) +#define COMPLEX_DBL_CONST(a) ((const armpl_doublecomplex_t*)a) +#define COMPLEX_FLOAT(a) ((armpl_singlecomplex_t*)a) +#define COMPLEX_FLOAT_CONST(a) ((const armpl_singlecomplex_t*)a) +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, @@ -256,11 +279,19 @@ void gemm( zgemm_( &transa_, &transb_, &m_, &n_, &k_, +<<<<<<< HEAD &alpha_, a, &lda_, b, &ldb_, &beta_, c, &ldc_); +======= + COMPLEX_DBL_CONST(&alpha_), + COMPLEX_DBL_CONST(a), &lda_, + COMPLEX_DBL_CONST(b), &ldb_, + COMPLEX_DBL_CONST(&beta_), + COMPLEX_DBL(c), &ldc_); +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -299,11 +330,19 @@ void gemm( cgemm_( &transa_, &transb_, &m_, &n_, &k_, +<<<<<<< HEAD &alpha_, a, &lda_, b, &ldb_, &beta_, c, &ldc_); +======= + COMPLEX_FLOAT_CONST(&alpha_), + COMPLEX_FLOAT_CONST(a), &lda_, + COMPLEX_FLOAT_CONST(b), &ldb_, + COMPLEX_FLOAT_CONST(&beta_), + COMPLEX_FLOAT(c), &ldc_); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -322,6 +361,27 @@ void gemm( const float beta, at::BFloat16 *c, int64_t ldc) { internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); +<<<<<<< HEAD +======= +#if AT_MKLDNN_ENABLED() +#ifdef __aarch64__ + // MKLDNN also supports ARM for bf16, and the bypass is only + // currently intended for x86/x86_64. + const bool use_bf16_gemv_trans = false; +#elif defined(__powerpc__) + const bool use_bf16_gemv_trans = false; +#else + const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() && + !cpuinfo_has_x86_avx512bf16(); + const bool use_bf16_gemv_trans = bf16_gemv_trans_would_be_faster && + transa == TransposeType::Transpose && + transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0; +#endif + if (!use_bf16_gemv_trans && mkldnn_bf16_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { + return; + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if AT_BUILD_WITH_BLAS() && defined(BLAS_HAS_SBGEMM) if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) { int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; @@ -343,6 +403,7 @@ void gemm( return; } #endif +<<<<<<< HEAD #if AT_MKLDNN_ENABLED() #ifdef __aarch64__ // MKLDNN also supports ARM for bf16, and the bypass is only @@ -361,6 +422,8 @@ void gemm( return; } #endif +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) gemm_stub( at::kCPU, at::kBFloat16, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -423,6 +486,16 @@ void gemm( return; } #endif +<<<<<<< HEAD +======= +#if AT_MKLDNN_ACL_ENABLED() +// add heuristic based on shape to dispatch to sbgemm_ vs MKLDNN + if (mkldnn_bf16f32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { + return; + } +#endif //AT_MKLDNN_ACL_ENABLED + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifdef MKL_HAS_SBGEMM if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) { int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; @@ -433,18 +506,31 @@ void gemm( // for the fallback path, first compute gemm with beta = 0, // and then add c in full precision. 
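// Concretely, in the fallback below: the product is accumulated into a float
// buffer via gemm_no_downcast_stub with beta = 0, and the original C is folded
// back in as c[offset] = beta * c[offset] + float_c[j * m + i], so no
// intermediate value is downcast before the final store.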
int64_t c_size = n * m; +<<<<<<< HEAD std::vector bfloat_c(c_size, 0.f); gemm_stub( at::kCPU, at::kBFloat16, transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, bfloat_c.data(), m); +======= + std::vector float_c(c_size, 0.f); + gemm_no_downcast_stub( + at::kCPU, at::kBFloat16, + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto j : c10::irange(n)) { for (const auto i : c10::irange(m)) { auto offset = j * ldc + i; // beta == 0 won't propagate NaN from C if (beta == 0.f) { +<<<<<<< HEAD c[offset] = c10::convert(bfloat_c[j * m + i]); } else { c[offset] = beta * c[offset] + c10::convert(bfloat_c[j * m + i]); +======= + c[offset] = float_c[j * m + i]; + } else { + c[offset] = beta * c[offset] + float_c[j * m + i]; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } } @@ -554,7 +640,11 @@ using is_blas_library_type = std::integral_constant>>; template +<<<<<<< HEAD void gemm_batched_generic( +======= +static void gemm_batched_generic( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TransposeType transa, TransposeType transb, int64_t batch_size, int64_t m, int64_t n, int64_t k, scalar_t alpha, @@ -568,7 +658,11 @@ void gemm_batched_generic( } template +<<<<<<< HEAD void gemm_batched( +======= +static void gemm_batched( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TransposeType transa, TransposeType transb, int64_t batch_size, int64_t m, int64_t n, int64_t k, scalar_t alpha, @@ -596,7 +690,11 @@ void gemm_batched( } template +<<<<<<< HEAD void gemm_batched_with_stride_generic( +======= +static void gemm_batched_with_stride_generic( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TransposeType transa, TransposeType transb, int64_t batch_size, int64_t m, int64_t n, int64_t k, scalar_t alpha, @@ -739,7 +837,11 @@ void axpy(int64_t n, c10::complex a, const c10::complex *x, int6 #if C10_IOS cblas_zaxpy(i_n, &a, x, i_incx, y, i_incy); #else +<<<<<<< HEAD zaxpy_(&i_n, &a, x, &i_incx, y, &i_incy); +======= + zaxpy_(&i_n, COMPLEX_DBL(&a), COMPLEX_DBL_CONST(x), &i_incx, COMPLEX_DBL(y), &i_incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -764,7 +866,11 @@ void axpy(int64_t n, c10::complex a, const c10::complex *x, int64_ #if C10_IOS cblas_caxpy(i_n, &a, x, i_incx, y, i_incy); #else +<<<<<<< HEAD caxpy_(&i_n, &a, x, &i_incx, y, &i_incy); +======= + caxpy_(&i_n, COMPLEX_FLOAT(&a), COMPLEX_FLOAT_CONST(x), &i_incx, COMPLEX_FLOAT(y), &i_incy); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -838,7 +944,11 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif return; } @@ -862,7 +972,11 @@ void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #endif 
return; } @@ -945,7 +1059,11 @@ struct PackKey { } }; +<<<<<<< HEAD inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) { +======= +static inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (dtype == ScalarType::Float) { return dnnl::memory::data_type::f32; } else if (dtype == ScalarType::BFloat16) { @@ -1347,6 +1465,33 @@ void brgemm( "I8 Brgemm is only supported on X64 when oneDNN ukernel is enabled and `amx` is supported"); } +<<<<<<< HEAD +======= +void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const signed char* A, + const signed char* B, + int32_t* C, + bool is_vnni) { +#if defined(ONEDNN_UKERNEL_ENABLED) + if (is_vnni && Brgemm::device_check(ScalarType::Char)) { + Brgemm::call( + M, N, K, ld_a, ld_b, ld_c, add_C, A, B, C); + return; + } +#endif + // raise an error if the path is not supported + TORCH_CHECK(false, + "I8 Brgemm is only supported on X64 when oneDNN ukernel is enabled and `amx` is supported"); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void brgemm_release(bool is_vnni) { #if defined(ONEDNN_UKERNEL_ENABLED) if (is_vnni) { diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index c1045f78c430..f0f72a80df3d 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -29,6 +29,21 @@ using gemm_fn = void(*)( DECLARE_DISPATCH(gemm_fn, gemm_stub) +<<<<<<< HEAD +======= +using gemm_no_downcast_fn = void(*)( + at::ScalarType type, + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const Scalar& alpha, + const void *a, int64_t lda, + const void *b, int64_t ldb, + const Scalar& beta, + void *c, int64_t ldc); + +DECLARE_DISPATCH(gemm_no_downcast_fn, gemm_no_downcast_stub) + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) template void gemm( TransposeType transa, TransposeType transb, @@ -259,6 +274,22 @@ TORCH_API void brgemm( int32_t* C, bool is_vnni = true); +<<<<<<< HEAD +======= +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const signed char* A, + const signed char* B, + int32_t* C, + bool is_vnni = true); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Release brgemm hardware context TORCH_API void brgemm_release(bool is_vnni = true); diff --git a/aten/src/ATen/native/CPUFallback.cpp b/aten/src/ATen/native/CPUFallback.cpp index fd850846ba61..0dd83a5bcada 100644 --- a/aten/src/ATen/native/CPUFallback.cpp +++ b/aten/src/ATen/native/CPUFallback.cpp @@ -98,6 +98,7 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool const auto arguments_begin = stack->size() - num_arguments; std::vector tensor_args; +<<<<<<< HEAD std::vector tensor_args_indices; std::vector> tensorlist_args; @@ -105,6 +106,15 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool std::vector>> optional_tensorlist_args; std::vector optional_tensorlist_args_indices; +======= + std::vector tensor_args_indices; + + std::vector> tensorlist_args; + std::vector tensorlist_args_indices; + + std::vector>> 
optional_tensorlist_args; + std::vector optional_tensorlist_args_indices; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) std::optional tgt_device = std::nullopt; // save converted cpu tensor for TensorList and optional TensorList diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index be57917967fa..4b15f480b361 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -20,6 +20,20 @@ namespace at::native { Tensor channel_shuffle_cpu(const Tensor& self, int64_t groups) { +<<<<<<< HEAD +======= + TORCH_CHECK(self.dim() > 2, + "channel_shuffle expects input with > 2 dims, but got input with sizes ", + self.sizes()); + int64_t c = self.size(1); + TORCH_CHECK(groups > 0, + "Number of groups to divide channels in must be positive.", + " Value of groups:", groups); + TORCH_CHECK((c % groups) == 0, + "Number of channels must be divisible by groups. Got ", + c, " channels and ", groups, " groups."); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor output; if (self.numel() == 0) { output = self.alias(); diff --git a/aten/src/ATen/native/ComparisonUtils.cpp b/aten/src/ATen/native/ComparisonUtils.cpp index 4019cf2ff9b1..4518a36c5aae 100644 --- a/aten/src/ATen/native/ComparisonUtils.cpp +++ b/aten/src/ATen/native/ComparisonUtils.cpp @@ -13,15 +13,24 @@ class Tensor; namespace native { template +<<<<<<< HEAD void _assert_match(const O& original, const C& compared, const std::string& name) { +======= +static void _assert_match(const O& original, const C& compared, const std::string& name) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (compared) { bool equal = (original == compared.value()); if (!equal) { std::stringstream msg; +<<<<<<< HEAD msg << "Tensor " << name << " mismatch!"; if (!equal) { throw std::runtime_error(msg.str()); } +======= + msg << "Tensor " << name << " mismatch! 
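// That is: bail out early for non-MPS, undefined, or sparse inputs, and
// otherwise let suggest_memory_format() decide between ChannelsLast and
// ChannelsLast3d.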
Expected: " << compared.value() << ", Got: " << original; + throw std::runtime_error(msg.str()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } } @@ -30,7 +39,13 @@ void _assert_tensor_metadata_meta_symint(at::Tensor const& tensor, at::OptionalS _assert_match(tensor.sym_sizes(), sizes, "sizes"); _assert_match(tensor.sym_strides(), strides, "strides"); _assert_match(tensor.dtype(), dtype, "dtype"); +<<<<<<< HEAD _assert_match(tensor.device(), device, "device"); +======= + if (tensor.device().type() != DeviceType::Meta) { + _assert_match(tensor.device(), device, "device"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _assert_match(tensor.layout(), layout, "layout"); } @@ -38,7 +53,13 @@ void _assert_tensor_metadata(at::Tensor const& tensor, at::OptionalIntArrayRef s _assert_match(tensor.sizes(), sizes, "sizes"); _assert_match(tensor.strides(), strides, "strides"); _assert_match(tensor.dtype(), dtype, "dtype"); +<<<<<<< HEAD _assert_match(tensor.device(), device, "device"); +======= + if (tensor.device().type() != DeviceType::Meta) { + _assert_match(tensor.device(), device, "device"); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) _assert_match(tensor.layout(), layout, "layout"); } diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 55b11cdfd698..8439d479cecb 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -454,4 +454,22 @@ inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor return is_channel_last(input) || is_channel_last(weight); } +<<<<<<< HEAD +======= +inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // check layout only for mps tensor. 
+ if (!input.is_mps() || !weight.is_mps()) { + return false; + } + if (!input.defined() || input.is_sparse()) { + // suggest channels_first + return false; + } + + auto fmt = input.suggest_memory_format(); + return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // namespace at::native diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 162dfe56aa05..bf29a932f664 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -30,6 +30,13 @@ #include #endif +<<<<<<< HEAD +======= +#ifdef USE_MPS +#include +#endif + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS #include #include @@ -93,7 +100,11 @@ static bool conv_benchmark_empty_cache = true; // Check workload to activate fast depthwise FP16 cudnn conv kernels template +<<<<<<< HEAD bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) { +======= +static bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto w = at::symint::size(input, 3); // same as h auto ch = at::symint::size(input, 1); auto bs = at::symint::size(input, 0); @@ -216,7 +227,11 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) { // simplified version for cudnn 8.2 and above template +<<<<<<< HEAD bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) { +======= +static bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // 1D conv if(at::symint::size(input, 2) == 1 && stride == 1){ return true; @@ -442,11 +457,14 @@ struct ConvParams { } } if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous) { +<<<<<<< HEAD // bypass dilation checks for channels_last convolution if (deterministic && is_dilated()) { // cudnn doesn't support deterministic dilated convolution fully yet return false; } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (is_dilated()) { return detail::getCUDAHooks().supportsDilatedConvolutionWithCuDNN() && !is_output_padding_big(); } @@ -466,6 +484,7 @@ struct ConvParams { // always use cudnn_depthwise for channels_last format return true; } +<<<<<<< HEAD if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { long cudnn_version = detail::getCUDAHooks().versionCuDNN(); if (cudnn_version >= 8200) { @@ -484,10 +503,27 @@ struct ConvParams { // keep (7600 <= cudnn < 8200) code unchanged bool kernel_cond = (cudnn_version >= 7600 && use_cudnn(input, weight) && +======= + // native kernel doesn't support 64-bit non-splittable case + if (cudnn_enabled && needs_64bit_indexing_no_split(input, weight)) { + static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? 
detail::getCUDAHooks().versionCuDNN() : -1; + if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { + TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" + " if the V8 API is not enabled or before cuDNN version 9.3+." + " Upgrade cuDNN or enable the V8 API to use cuDNN for 64-bit depthwise convolutions."); + return false; + } else { + return true; + } + } + if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { + bool kernel_cond = (use_cudnn(input, weight) && +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input.scalar_type() == kHalf && // only for FP16 weight.scalar_type() == kHalf && is_depthwise(input, weight) && input.ndimension() == 4 && // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks +<<<<<<< HEAD at::symint::size(weight, 2) == at::symint::size(weight, 3) && // only square kernels at::symint::size(input, 2) >= 7 && // min width/height 7 !is_dilated() && // no dilation supported @@ -499,6 +535,15 @@ struct ConvParams { } else { return false; } +======= + !is_dilated() && // no dilation supported + (stride[0] == stride[1] || at::symint::size(input, 2) == 1) && // square or 1d + at::symint::size(input, 1) >= 32); // min 32 channels supported) + if (kernel_cond) { + return check_cudnn_depthwise_workload_with_filter(input, stride[1], weight); + } + return false; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } else { return false; } @@ -639,7 +684,11 @@ REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub) REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub) template +<<<<<<< HEAD std::ostream& operator<<(std::ostream & out, const ConvParams& params) { +======= +static std::ostream& operator<<(std::ostream & out, const ConvParams& params) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) out << "ConvParams {" << " stride = " << IntArrayRef{params.stride} << " padding = " << ArrayRef{params.padding} @@ -1202,7 +1251,11 @@ at::Tensor convolution_overrideable( // a bool indicating whether the bias is defined. This is done to save memory by // avoiding saving the full bias tensor for backward. template +<<<<<<< HEAD ConvBackend _select_conv_backend( +======= +static ConvBackend _select_conv_backend( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& weight, const std::optional& bias, @@ -1416,7 +1469,11 @@ static inline at::MemoryFormat determine_backend_memory_format( const Tensor& input, const Tensor& weight, const ConvBackend backend) { +<<<<<<< HEAD at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; +======= + auto backend_memory_format = at::MemoryFormat::Contiguous; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !defined(C10_MOBILE) auto k = weight.ndimension(); // See Note [Mobile check segfaults] @@ -1452,6 +1509,19 @@ static inline at::MemoryFormat determine_backend_memory_format( backend_memory_format = (k == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; } break; +<<<<<<< HEAD +======= + case ConvBackend::Mps: + if (mps_conv_use_channels_last(input, weight)) { +#ifdef USE_MPS + if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) { + break; + } +#endif + backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; + } + break; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) default: backend_memory_format = at::MemoryFormat::Contiguous; } diff --git a/aten/src/ATen/native/Cross.cpp b/aten/src/ATen/native/Cross.cpp index 7297aaed80d3..657ba2014d70 100644 --- a/aten/src/ATen/native/Cross.cpp +++ b/aten/src/ATen/native/Cross.cpp @@ -6,6 +6,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #ifndef AT_PER_OPERATOR_HEADERS @@ -77,6 +81,12 @@ Tensor & cross_out(const Tensor & input, const Tensor & other, const std::option TORCH_IMPL_FUNC(linalg_cross_out) (const Tensor & input, const Tensor & other, int64_t dim, const Tensor & out) { +<<<<<<< HEAD +======= + at::assert_no_internal_overlap(out); + at::assert_no_overlap(out, input); + at::assert_no_overlap(out, other); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) dim = maybe_wrap_dim(dim, input.dim()); auto out_size = out.sizes(); Tensor input_broadcasted = input.expand(out_size); diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp index 1be4ec37dfef..396b9fe77127 100644 --- a/aten/src/ATen/native/DispatchStub.cpp +++ b/aten/src/ATen/native/DispatchStub.cpp @@ -4,6 +4,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #if !defined(__s390x__) && !defined(__powerpc__) #include @@ -26,6 +30,7 @@ static inline bool cpu_has_vxe() #endif static CPUCapability compute_cpu_capability() { +<<<<<<< HEAD auto envar = std::getenv("ATEN_CPU_CAPABILITY"); if (envar) { #if defined(HAVE_VSX_CPU_DEFINITION) @@ -34,14 +39,34 @@ static CPUCapability compute_cpu_capability() { } #elif defined(HAVE_ZVECTOR_CPU_DEFINITION) if (strcmp(envar, "zvector") == 0) { +======= + const auto envar = c10::utils::get_env("ATEN_CPU_CAPABILITY"); + if (envar.has_value()) { +#if defined(HAVE_VSX_CPU_DEFINITION) + if (envar == "vsx") { + return CPUCapability::VSX; + } +#elif defined(HAVE_ZVECTOR_CPU_DEFINITION) + if (envar == "zvector") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return CPUCapability::ZVECTOR; } #elif defined(HAVE_SVE_CPU_DEFINITION) int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW. #ifdef HAVE_SVE256_CPU_DEFINITION +<<<<<<< HEAD if (strcmp(envar, "sve256") == 0) { if (sve_vl == 256) { return CPUCapability::SVE256; +======= + if (envar == "sve256") { + if (sve_vl == 256) { +#ifdef HAVE_ARM_BF16_CPU_DEFINITION + if (cpuinfo_has_arm_bf16()) { + return CPUCapability::SVE256; + } +#endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } TORCH_WARN("SVE256 capability not available on hardware. 
Falling back to DEFAULT"); return CPUCapability::DEFAULT; @@ -49,20 +74,35 @@ static CPUCapability compute_cpu_capability() { #endif #else #ifdef HAVE_AVX512_CPU_DEFINITION +<<<<<<< HEAD if (strcmp(envar, "avx512") == 0) { +======= + if (envar == "avx512") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return CPUCapability::AVX512; } #endif #ifdef HAVE_AVX2_CPU_DEFINITION +<<<<<<< HEAD if (strcmp(envar, "avx2") == 0) { +======= + if (envar == "avx2") { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return CPUCapability::AVX2; } #endif #endif +<<<<<<< HEAD if (strcmp(envar, "default") == 0) { return CPUCapability::DEFAULT; } TORCH_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar); +======= + if (envar == "default") { + return CPUCapability::DEFAULT; + } + TORCH_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar.value()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #if !defined(__powerpc__) && !defined(__s390x__) && !defined(HAVE_SVE_CPU_DEFINITION) @@ -102,7 +142,14 @@ static CPUCapability compute_cpu_capability() { } #ifdef HAVE_SVE256_CPU_DEFINITION if (sve_vl == 256) { // Check for SVE256 +<<<<<<< HEAD return CPUCapability::SVE256; +======= + #ifdef HAVE_ARM_BF16_CPU_DEFINITION + if (cpuinfo_has_arm_bf16()) + return CPUCapability::SVE256; + #endif +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } #endif // Return the default CPU capability. @@ -147,6 +194,10 @@ DispatchResult DispatchStubImpl::try_get_call_ptr( c10::DeviceType::MPS, c10::DeviceType::MTIA, c10::DeviceType::XPU, +<<<<<<< HEAD +======= + c10::DeviceType::HPU, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::DeviceType::PrivateUse1 ); // Check if the device type is supported. @@ -203,6 +254,12 @@ DispatchResult DispatchStubImpl::try_get_call_ptr( return xpu_dispatch_ptr != nullptr ? DispatchResult(xpu_dispatch_ptr) : ErrorType::MissingDeviceKernel; #endif +<<<<<<< HEAD +======= + case DeviceType::HPU: + return hpu_dispatch_ptr != nullptr ? DispatchResult(hpu_dispatch_ptr) : ErrorType::MissingDeviceKernel; + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) case DeviceType::PrivateUse1: return privateuse1_dispatch_ptr != nullptr ? 
DispatchResult(privateuse1_dispatch_ptr) : ErrorType::MissingDeviceKernel; diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 725d0d08bae1..fad49a7df94a 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -44,6 +44,10 @@ // - MPS: Apple Silicon GPUs (Metal Performance Shaders) // - MTIA: Meta Training and Inference Devices // - XPU: Intel GPUs +<<<<<<< HEAD +======= +// - HPU: Reserved for HPU (Intel Gaudi) device types +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // - PrivateUse1: Reserved for private/custom device types // // If you want to update the list of supported devices, add a new dispatch_ptr @@ -63,7 +67,11 @@ enum class CPUCapability { VSX = 1, #elif defined(HAVE_ZVECTOR_CPU_DEFINITION) ZVECTOR = 1, +<<<<<<< HEAD #elif defined(HAVE_SVE_CPU_DEFINITION) +======= +#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SVE256 = 1, #else AVX2 = 1, @@ -196,6 +204,10 @@ struct TORCH_API DispatchStubImpl { #if defined(USE_XPU) void* xpu_dispatch_ptr; #endif +<<<<<<< HEAD +======= + void* hpu_dispatch_ptr; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void* privateuse1_dispatch_ptr; #else std::atomic cpu_dispatch_ptr{nullptr}; @@ -206,6 +218,10 @@ struct TORCH_API DispatchStubImpl { #if defined(USE_XPU) void* xpu_dispatch_ptr = nullptr; #endif +<<<<<<< HEAD +======= + void* hpu_dispatch_ptr = nullptr; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void* privateuse1_dispatch_ptr = nullptr; #endif }; @@ -259,6 +275,13 @@ struct DispatchStub { } #endif +<<<<<<< HEAD +======= + void set_hpu_dispatch_ptr(FnPtr fn_ptr) { + impl.hpu_dispatch_ptr = reinterpret_cast(fn_ptr); + } + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) void set_hip_dispatch_ptr(FnPtr fn_ptr) { impl.hip_dispatch_ptr = reinterpret_cast(fn_ptr); } @@ -338,6 +361,16 @@ struct RegisterXPUDispatch { }; template +<<<<<<< HEAD +======= +struct RegisterHPUDispatch { + RegisterHPUDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value){ + stub.set_hpu_dispatch_ptr(value); + } +}; + +template +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) struct RegisterMPSDispatch { RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { stub.set_mps_dispatch_ptr(value); @@ -437,6 +470,12 @@ struct RegisterPRIVATEUSE1Dispatch { #define REGISTER_XPU_DISPATCH(name, fn) \ static RegisterXPUDispatch name ## __register(name, fn); +<<<<<<< HEAD +======= +#define REGISTER_HPU_DISPATCH(name, fn) \ + static RegisterHPUDispatch name ## __register(name, fn); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define REGISTER_HIP_DISPATCH(name, fn) \ static RegisterHIPDispatch name ## __register(name, fn); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 336bf9364ac0..01c60a5ca7cf 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ 
-112,11 +112,19 @@ index_select_add( const Tensor& add_indices, const Tensor& src, Tensor& output, +<<<<<<< HEAD const Tensor& /*offsets*/, bool /*include_last_offset*/, Tensor& bag_size, index_t padding_idx, _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { +======= + [[maybe_unused]] const Tensor& offsets, + [[maybe_unused]] bool include_last_offset, + Tensor& bag_size, + index_t padding_idx, + [[maybe_unused]] _EmbeddingBagKernelCache* fbgemm_kernel_cache) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(select_indices.numel() == add_indices.numel()); auto* add_indices_data = add_indices.const_data_ptr(); auto* select_indices_data = select_indices.const_data_ptr(); @@ -499,11 +507,19 @@ index_select_scale_add( const Tensor& scale, const Tensor& src, Tensor& output, +<<<<<<< HEAD const Tensor& /*offsets*/, bool /*include_last_offset*/, Tensor& bag_size, index_t padding_idx, _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { +======= + [[maybe_unused]] const Tensor& offsets, + [[maybe_unused]] bool include_last_offset, + Tensor& bag_size, + index_t padding_idx, + [[maybe_unused]] _EmbeddingBagKernelCache* fbgemm_kernel_cache) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AT_ASSERT(select_indices.numel() == add_indices.numel()); auto* add_indices_data = add_indices.const_data_ptr(); auto* select_indices_data = select_indices.const_data_ptr(); @@ -535,9 +551,15 @@ index_select_scale_add( if (idx != padding_idx) { auto* src_base = src_data + src_stride0 * idx; auto* output_base = output_data + output_stride0 * add_indices_data[i]; +<<<<<<< HEAD auto scale = scale_data[i * scale_stride]; for (const auto j : c10::irange(ddim)) { output_base[j * output_stride1] += src_base[j * src_stride1] * scale; +======= + auto element_scale = scale_data[i * scale_stride]; + for (const auto j : c10::irange(ddim)) { + output_base[j * output_stride1] += src_base[j * src_stride1] * element_scale; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else if (bag_size_data) { // Decrement bag_size to reflect that the index is padded @@ -718,10 +740,17 @@ index_select_scale_add( if (idx != padding_idx) { auto* src_base = src_data + src_stride0 * idx; auto* output_base_fp32 = output_data_fp32 + ddim * add_indices_data[i]; +<<<<<<< HEAD auto scale = scale_data[i * scale_stride]; for (const auto j : c10::irange(ddim)) { output_base_fp32[j] += static_cast(src_base[j * src_stride1]) * static_cast(scale); +======= + auto element_scale = scale_data[i * scale_stride]; + for (const auto j : c10::irange(ddim)) { + output_base_fp32[j] += static_cast(src_base[j * src_stride1]) * + static_cast(element_scale); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else if (bag_size_data) { // Decrement bag_size to reflect that the index is padded @@ -851,9 +880,15 @@ index_select_scale_add(const Tensor &select_indices, if (idx != padding_idx) { auto* src_base = src_data + src_stride0 * idx; auto* output_base = output_data + output_stride0 * add_indices_data[i]; +<<<<<<< HEAD auto scale = scale_data[i * scale_stride]; for (const auto j : c10::irange(ddim)) { output_base[j * output_stride1] += src_base[j * src_stride1] * scale; +======= + auto element_scale = scale_data[i * scale_stride]; + for 
(const auto j : c10::irange(ddim)) { + output_base[j * output_stride1] += src_base[j * src_stride1] * element_scale; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else if (bag_size_data) { // Decrement bag_size to reflect that the index is padded @@ -941,7 +976,11 @@ void make_bag_size_out( void make_max_indices_out( Tensor& max_indices_out, const Tensor& weight, +<<<<<<< HEAD const Tensor& indices, +======= + [[maybe_unused]] const Tensor& indices, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& offsets, const Tensor& bag_size, const int64_t mode, @@ -1059,13 +1098,21 @@ static Tensor apply_bag_size_backward( } template +<<<<<<< HEAD void embedding_bag_cpu_max_out( +======= +static void embedding_bag_cpu_max_out( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor* max_indices, const Tensor& weight, const Tensor& indices, const Tensor& offset2bag, const Tensor& output, +<<<<<<< HEAD bool include_last_offset, +======= + [[maybe_unused]] bool include_last_offset, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Tensor& bag_size, int64_t padding_idx) { int64_t numIndices = indices.numel(); @@ -1323,9 +1370,15 @@ void _embedding_bag_cpu_out( const at::Tensor& weight, const at::Tensor& indices_, const at::Tensor& offsets_, +<<<<<<< HEAD const bool /* scale_grad_by_freq */, const int64_t mode, const bool /* sparse */, +======= + [[maybe_unused]] const bool scale_grad_by_freq, + const int64_t mode, + [[maybe_unused]] const bool sparse, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const std::optional& per_sample_weights, const bool include_last_offset, const std::optional& padding_idx, @@ -1505,7 +1558,11 @@ static std::vector compute_counts_uniq( } template +<<<<<<< HEAD void _embedding_bag_dense_backward_cpu_sum_mean( +======= +static void _embedding_bag_dense_backward_cpu_sum_mean( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad, const Tensor& indices_, const Tensor& offset2bag_, @@ -1553,11 +1610,19 @@ void _embedding_bag_dense_backward_cpu_sum_mean( &counts, &grad, &index_grad_weight, &padding_idx ](index_t start, index_t end) { for (index_t i = start; i < end; i++) { +<<<<<<< HEAD index_t start = i == 0 ? 0 : next_unique_index_idx[i - 1]; index_t index = indices_data[start]; if (index != static_cast(padding_idx)) { for (index_t j = start; j < next_unique_index_idx[i]; j++) { +======= + index_t indices_start = i == 0 ? 
0 : next_unique_index_idx[i - 1]; + index_t index = indices_data[indices_start]; + + if (index != static_cast(padding_idx)) { + for (index_t j = indices_start; j < next_unique_index_idx[i]; j++) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) index_t source = offset2bag_data[j]; double scale = 1.0; if (per_sample_weights) { @@ -1641,7 +1706,11 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi } template +<<<<<<< HEAD Tensor _embedding_bag_per_sample_weights_backward_cpu_template( +======= +static Tensor _embedding_bag_per_sample_weights_backward_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad, const Tensor& weight, // NB: embedding table, not per_sample_weights const Tensor& indices_, @@ -1747,7 +1816,11 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu( } Tensor _embedding_bag_sparse_backward_symint( +<<<<<<< HEAD const Tensor &grad_, const Tensor &indices, const Tensor &offsets, +======= + const Tensor &grad_, const Tensor &indices, [[maybe_unused]] const Tensor &offsets, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor &offset2bag, const Tensor &bag_size_, SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const std::optional& per_sample_weights_opt, int64_t padding_idx) { diff --git a/aten/src/ATen/native/FusedAdagrad.cpp b/aten/src/ATen/native/FusedAdagrad.cpp index 2fa6c4c877c1..0f5a66a65233 100644 --- a/aten/src/ATen/native/FusedAdagrad.cpp +++ b/aten/src/ATen/native/FusedAdagrad.cpp @@ -11,7 +11,10 @@ #include #endif +<<<<<<< HEAD +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace at::native { void _fused_adagrad_kernel_cpu_( @@ -31,12 +34,17 @@ void _fused_adagrad_kernel_cpu_( const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; if (found_inf_ptr && *found_inf_ptr == 1.0) { +<<<<<<< HEAD return; +======= + return; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } size_t n_tensors = params.size(); TORCH_CHECK(grads.size() == n_tensors); TORCH_CHECK(state_sums.size() == n_tensors); TORCH_CHECK(state_steps.size() == n_tensors); +<<<<<<< HEAD for (size_t i = 0; i < n_tensors; i++){ fused_adagrad_stub( kCPU, @@ -45,14 +53,59 @@ void _fused_adagrad_kernel_cpu_( state_sums[i], state_steps[i], lr, +======= + for (size_t i = 0; i < n_tensors; i++) { + fused_adagrad_stub( + kCPU, + params[i], + grads[i], + state_sums[i], + state_steps[i], + lr, + lr_decay, + weight_decay, + eps, + maximize, + grad_scale_ptr); + } +} + +void _fused_adagrad_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList state_sums, + at::TensorList state_steps, + const at::Tensor& lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf) { + _fused_adagrad_kernel_cpu_( + params, + grads, + state_sums, + state_steps, + lr.item(), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) lr_decay, weight_decay, eps, maximize, +<<<<<<< HEAD grad_scale_ptr); } +======= + grad_scale, + found_inf); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } DEFINE_DISPATCH(fused_adagrad_stub); +<<<<<<< HEAD } +======= +} // namespace at::native +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 5adcdc4daa4e..2e7a9aa5b7e3 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -5,6 +5,11 @@ #include #include #include +<<<<<<< HEAD +======= +#include +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -19,6 +24,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -40,6 +49,7 @@ namespace at::native { // Parse environment variable "TORCH_LINEAR_FLATTEN_3D" static inline bool parseLinearFlatten3d() { // Uninitialized value +<<<<<<< HEAD static int value = -1; if (value == -1) { const char* env_str = std::getenv("TORCH_LINEAR_FLATTEN_3D"); @@ -50,6 +60,10 @@ static inline bool parseLinearFlatten3d() { } } return bool(value); +======= + static auto value = c10::utils::check_env("TORCH_LINEAR_FLATTEN_3D"); + return value.has_value() && value.value(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // `_flatten_nd_linear` flattens all but the last dimension of the input tensor @@ -98,9 +112,16 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optionaldefined() && !input.is_xla()) { // Also hit the fused path for contiguous 3D input, if not using xla // backend. Reshaping/flattening has some performance implications on xla. 
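// The incoming branch below replaces the eager input.is_contiguous() call with a
// definitely_contiguous(sym_sizes, sym_strides, sym_numel) test, so the same
// fused-path decision can be made from symbolic shape metadata; when contiguity
// cannot be established symbolically, the fused flatten path is simply skipped.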
+<<<<<<< HEAD if (input.is_contiguous() && input_dim == 3) { return _flatten_nd_linear(input, weight, *bias); } else if (input.is_contiguous() && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { +======= + bool is_contiguous = definitely_contiguous(input.sym_sizes(), input.sym_strides(), input.sym_numel()); + if (is_contiguous && input_dim == 3) { + return _flatten_nd_linear(input, weight, *bias); + } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return _flatten_nd_linear(input, weight, *bias); } else if (parseLinearFlatten3d() && input_dim == 3) { // If user forces flattening via env var @@ -158,11 +179,19 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra Tensor left = left_; Tensor right = right_; for (const auto i : c10::irange(dim)) { +<<<<<<< HEAD auto sl = left.sym_size(i)!=1; auto sr = right.sym_size(i)!=1; if (sum_dims[i]) { // first dimensions that will be summed over after multiplication if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match"); +======= + auto sl = TORCH_GUARD_SIZE_OBLIVIOUS(left.sym_size(i).sym_ne(1)); + auto sr = TORCH_GUARD_SIZE_OBLIVIOUS(right.sym_size(i).sym_ne(1)); + if (sum_dims[i]) { // first dimensions that will be summed over after multiplication + if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size + TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) sum_size *= left.sym_size(i); } else if (sl) { // if it is only in one of left and right, we can sum right away left = left.sum(i, true); @@ -171,7 +200,11 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra } } else if (sl && sr) { // now deal with dimensions that will be in the output // dimensions nontrivially in both left and right must be of the same size +<<<<<<< HEAD TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match"); +======= + TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) lro.push_back(i); lro_size *= left.sym_size(i); } else if (sl) { // keep track of dimensions appearing only once @@ -481,10 +514,17 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr // Iterate over each dimension covered by ellipsis const auto ndim = operands[i].ndimension() - (static_cast(op_labels[i].size()) - 1); for (auto j = ell_num_dim - ndim; j < ell_num_dim; ++j) { +<<<<<<< HEAD if (op.sym_size(dim) != 1) { // Update ellipsis size TORCH_CHECK( ell_sizes[j] == 1 || ell_sizes[j] == op.sym_size(dim), +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) { + // Update ellipsis size + TORCH_SYM_CHECK( + ell_sizes[j].sym_eq(1).sym_or(ell_sizes[j].sym_eq(op.sym_size(dim))), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "einsum(): dimension ", dim, " 
covered by ellipsis in operand ", @@ -500,10 +540,17 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr permutation[ell_index + j] = dim++; } } else if (permutation[label_perm_index[s]] == -1) { +<<<<<<< HEAD if (op.sym_size(dim) != 1) { // Update subscript TORCH_CHECK( label_size[s] == 1 || label_size[s] == op.sym_size(dim), +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) { + // Update subscript + TORCH_SYM_CHECK( + label_size[s].sym_eq(1).sym_or(label_size[s].sym_eq(op.sym_size(dim))), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) "einsum(): subscript ", subscript_to_label(s), " has size ", @@ -578,16 +625,28 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr SmallVector a_dims_to_sum; SmallVector b_dims_to_sum; for (auto dim = out_num_dim; dim < perm_index; ++dim) { +<<<<<<< HEAD if (a.sym_size(dim) != 1 && b.sym_size(dim) != 1) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1)) + && TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (--dim_counts[dim] == 1) { sum_dims.push_back(dim); dim_counts[dim] = 0; } } else if (dim_counts[dim] == 1) { +<<<<<<< HEAD if (a.sym_size(dim) != 1) { a_dims_to_sum.push_back(dim); dim_counts[dim] = 0; } else if (b.sym_size(dim) != 1) { +======= + if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))) { + a_dims_to_sum.push_back(dim); + dim_counts[dim] = 0; + } else if (TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) b_dims_to_sum.push_back(dim); dim_counts[dim] = 0; } @@ -817,11 +876,43 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, rsizes.emplace_back(t2.sym_size(i)); } } +<<<<<<< HEAD // permute and reshape for matrix multiplication t1 = t1.permute(p1).reshape_symint({size1, csize}); t2 = t2.permute(p2).reshape_symint({csize, size2}); // multiply and reshape to target size return at::mm(t1, t2).reshape_symint(rsizes); +======= + + // Full contraction (size1 == 1 and size2 == 1) is much faster when done with dot ... + // TODO(@nikitaved): there are other cases where dot outperforms gemms, + // like, for example, when the non-contracted dims are relatively small. + // NOTE(@nikitaved): contract with gemm when on MPS, + // otherwise issues with the tests xpassing/xfailing + // when enabling the fast-path with dot. 
+ // TODO: resolve that + if ((t1.device().type() == at::kMPS || t2.device().type() == at::kMPS) || size1 != 1 || size2 != 1) { + // permute and reshape for matrix multiplication + t1 = t1.permute(p1).reshape_symint({size1, csize}); + t2 = t2.permute(p2).reshape_symint({csize, size2}); + // multiply and reshape to target size + return at::mm(t1, t2).reshape_symint(rsizes); + } else { + // permute to align for contraction + t1 = t1.permute(p1); + t2 = t2.permute(p2); + + if (t1.is_contiguous() && t2.is_contiguous()) { + // If t1 and t2 are both contiguous, then flatten is a view, + // then dot is the method of choice + return at::dot(t1.flatten(), t2.flatten()).reshape_symint(rsizes); + } else { + // Otherwise mul + sum can be faster as it avoids at most 2x contiguous() calls + // NOTE: t1.dtype == t2.dtype -- check above + return (t1.squeeze() * t2.squeeze()).sum(t1.scalar_type()).reshape_symint(rsizes); + } + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor &tensordot_out(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, IntArrayRef dims2, Tensor& result) { @@ -831,6 +922,17 @@ Tensor &tensordot_out(const Tensor& input1, const Tensor& input2, IntArrayRef di auto output_device = result.device(); auto input1_device = input1.device(); auto input2_device = input2.device(); +<<<<<<< HEAD +======= + + if(result.defined()) { + TORCH_CHECK( + !(result.requires_grad() && at::GradMode::is_enabled() && result.sizes() != result_tmp.sizes()), + "tensordot(): the 'out' tensor was specified and requires gradients, and its shape does not match the expected result. " + "Either remove the 'out' argument, ensure it does not require gradients, or make sure its shape matches the expected output." + ); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // check if the input & output tensors are on the same device. TORCH_CHECK( (output_device == input1_device) && (input1_device == input2_device), diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 1cfff77eb592..6e5db949d5f1 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -23,6 +23,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include @@ -285,7 +289,11 @@ TORCH_META_FUNC(_linalg_slogdet)(const Tensor& A) { } template +<<<<<<< HEAD void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { +======= +static void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -1366,8 +1374,13 @@ static inline int64_t get_mkldnn_matmul_min_dim() { //it's enabled on all Neoverse cpus. return is_arm_neoverse() ? 8 : 0; }(); +<<<<<<< HEAD const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_DIM"); return ptr != nullptr ? 
std::atoi(ptr) : default_min_dim; +======= + const auto value = c10::utils::get_env("TORCH_MKLDNN_MATMUL_MIN_DIM"); + return value.has_value() ? std::stoi(value.value()) : default_min_dim; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }(); return value; } @@ -1380,8 +1393,13 @@ static inline int64_t get_mkldnn_matmul_min_size() { // it's enabled on all Neoverse cpus. return is_arm_neoverse() ? 8 * 1024 : 0; }(); +<<<<<<< HEAD const char* ptr = std::getenv("TORCH_MKLDNN_MATMUL_MIN_SIZE"); return ptr != nullptr ? std::atoi(ptr) : default_min_size; +======= + const auto value = c10::utils::get_env("TORCH_MKLDNN_MATMUL_MIN_SIZE"); + return value.has_value() ? std::stoi(value.value()) : default_min_size; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }(); return value; } @@ -1639,7 +1657,11 @@ TORCH_IMPL_FUNC(mm_out_cpu)(const Tensor & self, const Tensor & mat2, const Tens } template +<<<<<<< HEAD inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, const Scalar& beta_, const Scalar& alpha_) { +======= +static inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, const Scalar& beta_, const Scalar& alpha_) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t bs = result.size(0); int64_t is = result.size(1); int64_t js = result.size(2); @@ -2652,7 +2674,11 @@ Tensor mexp_impl( // `norm_cpu` is used to decide which Tensors require which approximation // based on their norm. This decision takes place on CPU. // It requires moving data back and forth between devices when `a` is on CUDA, +<<<<<<< HEAD // but at the cost of only one sigle CPU-CUDA synchronization (instead of 6), +======= + // but at the cost of only one single CPU-CUDA synchronization (instead of 6), +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // and better performance overall (benchmarked). const auto norm_cpu = (a.device().type() == at::kCUDA) ? norm.to(at::kCPU) : norm; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 1513e756c71d..d6620850b996 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -126,6 +126,10 @@ std::tuple> ctc_loss_allocate_outpu // the alphas from the user by only returning the loss. 
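// The TORCH_MKLDNN_MATMUL_MIN_DIM / TORCH_MKLDNN_MATMUL_MIN_SIZE hunks above swap raw
// std::getenv + std::atoi for an optional-returning lookup. A minimal, self-contained
// sketch of that pattern follows; get_env below is a local stand-in, not the real
// c10::utils::get_env, whose exact signature is not shown in this diff.
#include <cstdint>
#include <cstdlib>
#include <optional>
#include <string>

// Local stand-in for an optional-returning environment lookup.
static std::optional<std::string> get_env(const char* name) {
  const char* raw = std::getenv(name);
  if (raw == nullptr) {
    return std::nullopt;
  }
  return std::string(raw);
}

// Same shape as the new code: fall back to a compile-time default when the variable
// is unset, otherwise parse the value as an integer.
static int64_t env_int_or(const char* name, int64_t default_value) {
  const auto value = get_env(name);
  return value.has_value() ? std::stoi(value.value()) : default_value;
}

// Usage: const int64_t min_dim = env_int_or("TORCH_MKLDNN_MATMUL_MIN_DIM", 8);
// Note that std::stoi throws on malformed input, whereas std::atoi silently returns 0,
// so typos in the variable now surface as errors instead of an unintended threshold.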
template std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t BLANK) { +<<<<<<< HEAD +======= + TORCH_CHECK(log_probs.numel() > 0, "log_probs tensor must not be empty"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // log_probs: input_len x batch_size x num_labels // targets [int64]: batch_size x target_length OR sum(target_lengths) constexpr scalar_t neginf = -std::numeric_limits::infinity(); diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 47c0a2be0303..36bc8b67c8c1 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -1680,7 +1680,11 @@ inline C10_HOST_DEVICE T calc_ndtri(T y0) { return x; } +<<<<<<< HEAD /* The next function is taken from http://ab-initio.mit.edu/Faddeev */ +======= +/* The next function is taken from http://ab-initio.mit.edu/faddeeva */ +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) /* Copyright (c) 2012 Massachusetts Institute of Technology * diff --git a/aten/src/ATen/native/MathBitsFallback.h b/aten/src/ATen/native/MathBitsFallback.h index de2296634e04..56cf21137c9b 100644 --- a/aten/src/ATen/native/MathBitsFallback.h +++ b/aten/src/ATen/native/MathBitsFallback.h @@ -22,7 +22,11 @@ namespace at::native { // NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit. struct MathOpFallback { +<<<<<<< HEAD MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {} +======= + MathOpFallback(DispatchKey key_, std::string op_name_) : key(key_), op_name(std::move(op_name_)) {} +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) virtual bool is_bit_set(const Tensor&) = 0; void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { /* @@ -151,7 +155,11 @@ struct MathOpFallback { virtual ~MathOpFallback() = default; DispatchKey key; +<<<<<<< HEAD string op_name; +======= + std::string op_name; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) }; } // namespace at::native diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp index 9a5ae286666c..3ee3c9581b08 100644 --- a/aten/src/ATen/native/NNPACK.cpp +++ b/aten/src/ATen/native/NNPACK.cpp @@ -25,8 +25,12 @@ at::Tensor _nnpack_spatial_convolution( const Tensor& weight, const std::optional& bias_opt, const IntArrayRef padding, const IntArrayRef stride) { +<<<<<<< HEAD throw std::runtime_error( "nnpack_spatial_convolution: ATen not compiled with NNPACK support"); +======= + TORCH_CHECK(false, "nnpack_spatial_convolution: ATen not compiled with NNPACK support"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } bool _nnpack_available() { @@ -143,6 +147,7 @@ Tensor _nnpack_spatial_convolution( input.options()); // Our input Tensor must be in the form N,C,H,W +<<<<<<< HEAD if (input.ndimension() != 4) { throw std::runtime_error( "NNPack convolutionOutput expects 4D input Tensor N,C,H,W"); @@ -180,14 +185,58 @@ Tensor _nnpack_spatial_convolution( << ") in NNPack convolutionOutput"; throw std::runtime_error(err.str()); } 
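// The NNPACK.cpp hunks in this region replace explicit
// "if (...) { throw std::runtime_error(...); }" blocks with single TORCH_CHECK calls
// that stream all message parts. A rough, self-contained sketch of that shape, using a
// hypothetical check_or_throw helper rather than the real macro (which also records the
// source location and throws c10::Error):
#include <sstream>
#include <stdexcept>
#include <utility>

template <typename... Msg>
void check_or_throw(bool cond, Msg&&... msg) {
  if (cond) {
    return;
  }
  std::ostringstream oss;
  (oss << ... << std::forward<Msg>(msg));  // C++17 fold over the message parts
  throw std::runtime_error(oss.str());
}

// Usage mirroring the channel-count check: one call replaces the if/throw block and
// keeps the message construction in a single place.
void check_channels(int input_channels, int weight_channels) {
  check_or_throw(
      input_channels == weight_channels,
      "Mismatch between number of input channels in input Tensor (", input_channels,
      ") and weight Tensor (", weight_channels, ") in NNPack convolutionOutput");
}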
+======= + TORCH_CHECK( + input.ndimension() == 4, + "NNPack convolutionOutput expects 4D input Tensor N,C,H,W"); + + // Our weight Tensor must be in the form oC,iC,kH,kW + TORCH_CHECK( + weight.ndimension() == 4, + "NNPack convolutionOutput expects 4D weight Tensor oC,iC,kH,kW"); + + // Our output Tensor must be in the form N,oC,oH,oW + TORCH_CHECK( + output.ndimension() == 4, + "NNPack convolutionOutput expects 4D output Tensor N,oC,oH,oW"); + + // Some basic shape checking, not comprehensive + TORCH_CHECK( + input.size(1) == weight.size(1), + "Mismatch between number of input channels in input Tensor (", + input.size(1), + ") and weight Tensor (", + weight.size(1), + ") in NNPack convolutionOutput"); + + TORCH_CHECK( + weight.size(0) == output.size(1), + "Mismatch between number of output channels in weight Tensor (", + weight.size(0), + ") and output Tensor (", + output.size(1), + ") in NNPack convolutionOutput"); + + TORCH_CHECK( + input.size(0) == output.size(0), + "Mismatch between batch size in input Tensor (", + input.size(0), + ") and output Tensor (", + output.size(0), + ") in NNPack convolutionOutput"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // All Tensors must be float Tensors if (input.device().type() != kCPU || input.scalar_type() != kFloat || weight.device().type() != kCPU || weight.scalar_type() != kFloat || output.device().type() != kCPU || output.scalar_type() != kFloat || (bias.defined() && (bias.device().type() != kCPU || bias.scalar_type() != kFloat))) { +<<<<<<< HEAD throw std::runtime_error( "Mismatched Tensor types in NNPack convolutionOutput"); +======= + TORCH_CHECK(false, "Mismatched Tensor types in NNPack convolutionOutput"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } const auto algorithm = nnp_convolution_algorithm_auto; @@ -281,9 +330,15 @@ Tensor _nnpack_spatial_convolution( auto size_and_allocate_ws = [&]() { // Run a single pass to get the size of memory workspace buffer const auto status = compute(batch_size); +<<<<<<< HEAD if (status != nnp_status_success) { throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed"); } +======= + TORCH_CHECK( + status == nnp_status_success, + "NNPACK SpatialConvolution_updateOutput failed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) workspace.allocate(); }; @@ -304,9 +359,15 @@ Tensor _nnpack_spatial_convolution( status = compute(batch_size); } +<<<<<<< HEAD if (status != nnp_status_success) { throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed"); } +======= + TORCH_CHECK( + status == nnp_status_success, + "NNPACK SpatialConvolution_updateOutput failed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return output; } diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index cb9f3c469349..7973c01fe38f 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -20,9 +20,12 @@ namespace at::native { +<<<<<<< HEAD template void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t lda, scalar_t *x, int64_t incx, scalar_t beta, scalar_t *y, int64_t incy); +======= +>>>>>>> 5729657180 ([ROCm] 
Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) namespace { static inline void slow_conv_transpose3d_shape_check( @@ -299,7 +302,11 @@ void slow_conv_transpose3d_out_cpu_template( int64_t elt; // For each elt in batch, do: for (elt = 0; elt < batch_size; ++elt) { +<<<<<<< HEAD // Matrix mulitply per output: +======= + // Matrix multiply per output: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input_n = input.select(0, elt); output_n = output.select(0, elt); @@ -523,7 +530,11 @@ void slow_conv_transpose3d_backward_out_cpu_template( int64_t elt; // For each elt in batch, do: for (elt = 0; elt < batch_size; ++elt) { +<<<<<<< HEAD // Matrix mulitply per sample: +======= + // Matrix multiply per sample: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) grad_input_n = grad_input.select(0, elt); grad_output_n = grad_output.select(0, elt); @@ -739,12 +750,20 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu( int64_t elt; // For each elt in batch, do: for (elt = 0; elt < batch_size; ++elt) { +<<<<<<< HEAD // Matrix mulitply per output: +======= + // Matrix multiply per output: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) grad_output_n = grad_output.select(0, elt); // Do Weight: if (grad_weight.defined()) { +<<<<<<< HEAD // Matrix mulitply per output: +======= + // Matrix multiply per output: +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) input_n = input.select(0, elt); if (need_columns) { diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index ba7d9601fad5..ddd0dbb4b36f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -61,7 +61,10 @@ #include #include #include +<<<<<<< HEAD #include +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) static const int MIOPEN_DIM_MAX = 5; @@ -133,7 +136,11 @@ static inline MemoryFormat suggest_memory_format_contig(const Tensor& t) { } template +<<<<<<< HEAD std::tuple batch_norm_cpu_transform_input_template( +======= +static std::tuple batch_norm_cpu_transform_input_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& save_mean /* optional */, const Tensor& save_invstd /* optional */, const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, @@ -198,7 +205,11 @@ std::tuple batch_norm_cpu_transform_input_template( } template class VarTransform> +<<<<<<< HEAD std::tuple batch_norm_cpu_update_stats_template( +======= +static std::tuple batch_norm_cpu_update_stats_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& running_mean, const Tensor& running_var, double momentum, double eps, Tensor& save_mean, Tensor& save_var_transform) { @@ -288,7 +299,11 @@ std::tuple batch_norm_cpu_update_stats_template( } template class VarTransform> +<<<<<<< HEAD std::tuple batch_norm_cpu_update_stats_template( +======= +static std::tuple 
batch_norm_cpu_update_stats_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& input, const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) { int64_t n_input = input.size(1); @@ -307,7 +322,11 @@ std::tuple batch_norm_cpu_update_stats_template( } template +<<<<<<< HEAD std::tuple batch_norm_backward_cpu_template( +======= +static std::tuple batch_norm_backward_cpu_template( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& grad_out_, const Tensor& input, const Tensor& weight, const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, bool train, double eps, std::array grad_input_mask) { @@ -528,6 +547,7 @@ BatchNormBackend _select_batch_norm_backend( bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000); if ( +<<<<<<< HEAD input.is_cuda() && (input.dim() <= MIOPEN_DIM_MAX) && (input.scalar_type() != at::kDouble) @@ -541,6 +561,22 @@ BatchNormBackend _select_batch_norm_backend( && (input.dim() >= 3) && detail::getCUDAHooks().compiledWithMIOpen() && cudnn_enabled +======= + detail::getCUDAHooks().compiledWithMIOpen() + && cudnn_enabled + && input.is_cuda() + && input.dim() <= MIOPEN_DIM_MAX + && input.dim() >= 3 + && input.scalar_type() != at::kDouble +#if (defined(USE_ROCM) && ROCM_VERSION < 60400) + && (input.scalar_type() != at::kBFloat16) +#endif + && (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16) + && weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input + && weight.defined() && bias.defined() + && ((running_mean.defined() && running_var.defined()) + || (!running_mean.defined() && !running_var.defined() && training)) +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) && (input.suggest_memory_format() == MemoryFormat::Contiguous #if (defined(USE_ROCM) && ROCM_VERSION >= 60500) || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM) @@ -554,7 +590,10 @@ BatchNormBackend _select_batch_norm_backend( return BatchNormBackend::Native; } +<<<<<<< HEAD bool PYTORCH_MIOPEN_EXTRA_LOGGING = c10::utils::check_env("PYTORCH_MIOPEN_EXTRA_LOGGING").value_or(false); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // _batch_norm_impl_index(_backward) are used in the JIT be able to keep the run-time selection // of backends, while enabling it to keep the information about the used backend, so that it can @@ -565,6 +604,7 @@ std::tuple _batch_norm_impl_index( const Tensor& input, const std::optional& weight_opt /* optional */, const std::optional& bias_opt /* optional */, const std::optional& running_mean_opt /* optional */, const std::optional& running_var_opt /* optional */, bool training, double momentum, double eps, bool cudnn_enabled) { // See [Note: hacky wrapper removal for optional tensor] +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index" @@ -579,6 +619,8 @@ std::tuple _batch_norm_impl_index( << " cudnn_enabled=" << cudnn_enabled << std::endl; +======= +>>>>>>> 5729657180 
([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; const Tensor& bias = bias_opt.value_or(Tensor()); @@ -638,6 +680,7 @@ std::tuple _batch_norm_impl_index( Tensor reserve = at::empty({0}, input.options().dtype(kByte)); +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (use_miopen)" @@ -656,6 +699,9 @@ std::tuple _batch_norm_impl_index( if (backend == BatchNormBackend::Miopen) { if (PYTORCH_MIOPEN_EXTRA_LOGGING) std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (calling miopen_batch_norm)" << std::endl; +======= + if (backend == BatchNormBackend::Miopen) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return std::tuple_cat( at::miopen_batch_norm( input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(), @@ -678,8 +724,11 @@ std::tuple _batch_norm_impl_index_backward( const Tensor& input, const Tensor& grad_output, const std::optional& weight_opt /* optional */, const std::optional& running_mean_opt /* optional */, const std::optional& running_var_opt /* optional */, const std::optional& save_mean_opt /* optional */, const std::optional& save_var_transform_opt /* optional */, bool train, double epsilon, std::array output_mask, const Tensor &reservedSpace) { // See [Note: hacky wrapper removal for optional tensor] +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; const Tensor& running_mean = running_mean_opt.value_or(Tensor()); @@ -710,16 +759,22 @@ std::tuple _batch_norm_impl_index_backward( // backward in inference mode is not supported in cudnn, fallback to native if (impl_index == 0 || (!train)) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling native_batch_norm_backward)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask); } else if (impl_index == 1) { // TODO: _batch_norm_impl_index_backward is only used in JIT. 
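// The _select_batch_norm_backend hunk earlier in this file rewrites the MIOpen
// eligibility test as one long conjunction. The sketch below is a loose illustration
// with made-up field names, not the actual PyTorch logic; it only shows how the
// predicate decomposes, including the rule that running stats must be either both
// present or both absent (the latter only while training).
struct BatchNormInputs {
  bool compiled_with_miopen;
  bool cudnn_enabled;
  bool input_is_cuda;
  int input_dim;
  bool input_is_double;
  bool weight_is_float;
  bool weight_defined;
  bool bias_defined;
  bool has_running_mean;
  bool has_running_var;
  bool training;
};

enum class Backend { Miopen, Native };

Backend select_backend(const BatchNormInputs& in, int miopen_dim_max) {
  const bool stats_consistent =
      (in.has_running_mean && in.has_running_var) ||
      (!in.has_running_mean && !in.has_running_var && in.training);
  const bool eligible =
      in.compiled_with_miopen && in.cudnn_enabled && in.input_is_cuda &&
      in.input_dim >= 3 && in.input_dim <= miopen_dim_max &&
      !in.input_is_double && in.weight_is_float &&
      in.weight_defined && in.bias_defined && stats_consistent;
  // Dtype/version gating for bfloat16 and the memory-format check are omitted here.
  return eligible ? Backend::Miopen : Backend::Native;
}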
cudnn NHWC // format conversion is done inside cudnn_batch_norm_backward instead return at::cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon, reservedSpace); } else if (impl_index == 2) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index_backward (calling miopen_batch_norm_backward)" << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::miopen_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, epsilon); } TORCH_INTERNAL_ASSERT(false, "Unsupported impl_index in _batch_norm_impl_index_backward: ", impl_index); @@ -730,6 +785,7 @@ Tensor batch_norm( const Tensor& input, const std::optional& weight_opt, const std::optional& bias_opt, const std::optional& running_mean_opt, const std::optional& running_var_opt, bool training, double momentum, double eps, bool cudnn_enabled) { +<<<<<<< HEAD if (PYTORCH_MIOPEN_EXTRA_LOGGING) std :: cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* batch_norm" @@ -744,6 +800,8 @@ Tensor batch_norm( << " cudnn_enabled=" << cudnn_enabled << std::endl; +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& weight = weight_opt.value_or(Tensor()); const Tensor& bias = bias_opt.value_or(Tensor()); const Tensor& running_mean = running_mean_opt.value_or(Tensor()); @@ -835,6 +893,14 @@ std::tuple batch_norm_update_stats_cpu( std::tuple batch_norm_cpu_out(const Tensor& self, const std::optional& weight_opt, const std::optional& bias_opt, const std::optional& running_mean_opt, const std::optional& running_var_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) { +<<<<<<< HEAD +======= + const bool has_running_mean = (running_mean_opt.has_value() && running_mean_opt->defined()); + const bool has_running_var = (running_var_opt.has_value() && running_var_opt->defined()); + TORCH_CHECK_VALUE(has_running_mean == has_running_var, + "running_mean and running_var must either both be None or neither be None"); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index e7e8a49b452f..195065b13d58 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -880,7 +880,11 @@ struct FullBidirectionalLayer step_inputs = input_w.unbind(0); auto fw_result = layer_( step_inputs, input_hidden.first, params.first, true); +<<<<<<< HEAD TORCH_CHECK(fw_result.outputs.size() > 0, "Expected sequence length to be larger than 0 in RNN"); +======= + TORCH_CHECK(!fw_result.outputs.empty(), "Expected sequence length to be larger than 0 in RNN"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto fw_output = at::stack(fw_result.outputs, 0); input_w = params.second.linear_ih(input); step_inputs = input_w.unbind(0); @@ -895,7 +899,11 @@ struct FullBidirectionalLayer step_inputs = input.unbind(0); auto fw_result = 
layer_(step_inputs, input_hidden.first, params.first); +<<<<<<< HEAD TORCH_CHECK(fw_result.outputs.size() > 0, "Expected sequence length to be larger than 0 in RNN"); +======= + TORCH_CHECK(!fw_result.outputs.empty(), "Expected sequence length to be larger than 0 in RNN"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) auto fw_output = at::stack(fw_result.outputs, 0); auto rev_step_inputs = reverse(std::move(step_inputs)); auto rev_result = diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp index 5ecc0f159331..3703d10b2486 100644 --- a/aten/src/ATen/native/RangeFactories.cpp +++ b/aten/src/ATen/native/RangeFactories.cpp @@ -157,12 +157,17 @@ Tensor& range_out(const Scalar& start, const Scalar& end, const Scalar& step, Te auto xend = end.to(); auto xstep = step.to(); +<<<<<<< HEAD TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and lower bound inconsistent with step sign"); +======= + arange_check_bounds(start, end, step); + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t size = static_cast(((xend - xstart) / xstep) + 1); if (result.numel() != size) { result.resize_({size}); diff --git a/aten/src/ATen/native/RangeUtils.h b/aten/src/ATen/native/RangeUtils.h index d1756db75016..94645ff462c9 100644 --- a/aten/src/ATen/native/RangeUtils.h +++ b/aten/src/ATen/native/RangeUtils.h @@ -2,6 +2,7 @@ #include #include +<<<<<<< HEAD namespace at { namespace native { @@ -19,6 +20,36 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar "unsupported range: ", xstart, " -> ", xend); TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); +======= + + +namespace at::native { + +inline void arange_check_bounds( + const c10::Scalar& start, + const c10::Scalar& end, + const c10::Scalar& step) { + // use double precision for validation to avoid precision issues + double dstart = start.to(); + double dend = end.to(); + double dstep = step.to(); + + TORCH_CHECK(dstep > 0 || dstep < 0, "step must be nonzero"); + TORCH_CHECK( + std::isfinite(dstart) && std::isfinite(dend), + "unsupported range: ", + dstart, + " -> ", + dend); + TORCH_CHECK( + ((dstep > 0) && (dend >= dstart)) || ((dstep < 0) && (dend <= dstart)), + "upper bound and lower bound inconsistent with step sign"); +} + +template +int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar& step) { + arange_check_bounds(start, end, step); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // we use double precision for (start - end) / step // to compute size_d for consistency across devices. 
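// The RangeUtils.h hunk above factors the bounds validation into arange_check_bounds
// and, as the comment notes, does the size computation in double precision for
// cross-device consistency. A self-contained sketch of the floating-point path under
// those assumptions (the integral path uses an accumulation type and a sign
// correction that is omitted here):
#include <cmath>
#include <cstdint>
#include <stdexcept>

int64_t arange_size(double start, double end, double step) {
  if (step == 0.0) {
    throw std::invalid_argument("step must be nonzero");
  }
  if (!std::isfinite(start) || !std::isfinite(end)) {
    throw std::invalid_argument("unsupported range");
  }
  if (!((step > 0 && end >= start) || (step < 0 && end <= start))) {
    throw std::invalid_argument("upper bound and lower bound inconsistent with step sign");
  }
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

// e.g. arange_size(0.0, 1.0, 0.3) == 4, matching the four elements 0, 0.3, 0.6, 0.9
// produced by torch.arange(0, 1, 0.3).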
@@ -29,6 +60,13 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar // the corner-case we do want to take into account is int64_t, which has higher precision than double double size_d; if constexpr (std::is_same_v) { +<<<<<<< HEAD +======= + using accscalar_t = at::acc_type; + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t sgn = (xstep > 0) - (xstep < 0); size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); } else { @@ -42,4 +80,8 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar return static_cast(size_d); } +<<<<<<< HEAD }} // namespace at::native +======= +} // namespace at::native +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index e5778411870c..53b88ff19b9e 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -472,7 +472,11 @@ Tensor& logcumsumexp_out(const Tensor& self, int64_t dim, Tensor& result) { } template +<<<<<<< HEAD void impl_func_cum_ops( +======= +static void impl_func_cum_ops( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, const Tensor& result, @@ -769,7 +773,11 @@ inline bool isnan_(T x) { } template +<<<<<<< HEAD void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data, +======= +static void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int self_dim_size, int self_stride, int values_stride, int indices_stride) { Operation op; T1 out = c10::load(self_data); @@ -1182,7 +1190,11 @@ std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_o // ALL REDUCE ################################################################# +<<<<<<< HEAD inline bool should_use_acc_buffer(at::TensorIterator& iter) { +======= +static inline bool should_use_acc_buffer(at::TensorIterator& iter) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const auto ndim = iter.ndim(); if (!iter.device().is_cpu() || iter.noutputs() != 1) { return false; @@ -1244,7 +1256,11 @@ Tensor& sum_out(const Tensor& self, DimnameList dim, Tensor& nansum_out(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, std::optional opt_dtype, Tensor& result) { if (self.device().is_cpu()) { +<<<<<<< HEAD TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "nansum does not support complex inputs"); +======= + TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "nansum on CPU does not support complex inputs"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // For integral types, use existing sum as @@ -1591,7 +1607,11 @@ Tensor norm(const Tensor& self, const Scalar& p) { return at::norm(self, p, IntArrayRef{}, false); } +<<<<<<< HEAD inline TensorIterator get_allany_iter( +======= +static inline TensorIterator get_allany_iter( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with 
float/bfloat16/half (#2791)) const Tensor& self, const Tensor& result, OptionalIntArrayRef dims, @@ -1608,7 +1628,11 @@ inline TensorIterator get_allany_iter( } template +<<<<<<< HEAD inline void allany_impl( +======= +static inline void allany_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, const Tensor& result, OptionalIntArrayRef dims, @@ -1653,7 +1677,11 @@ TORCH_IMPL_FUNC(any_all_out)(const Tensor& self, const Tensor& result) { } template +<<<<<<< HEAD Tensor allany_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) { +======= +static Tensor allany_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Default implementation in terms of all-reduce or single dim reduce if (!dim) { Tensor out; @@ -1732,7 +1760,11 @@ TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef dim, bool keepdim, co } template +<<<<<<< HEAD void argmax_argmin_impl( +======= +static void argmax_argmin_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, std::optional dim, bool keepdim, diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index fe1db473ea4c..8575a91534b8 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -74,7 +74,11 @@ Tensor repeat_interleave_symint( } Tensor repeats_ = repeats; +<<<<<<< HEAD if (repeats.dim() == 0 || (repeats.dim() == 1 && repeats.sym_size(0) == 1)) { +======= + if (repeats.dim() == 0 || (repeats.dim() == 1 && TORCH_GUARD_OR_FALSE(repeats.sym_size(0).sym_eq(1)))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) repeats_ = repeats.reshape({1}).expand_symint({input.sym_size(dim.value())}); } else if (repeats.dim() == 1) { TORCH_CHECK( diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index a80b8eb52b61..9efaad95d76a 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -9,6 +9,10 @@ #include #else #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -21,7 +25,11 @@ namespace at::native { // Returns true if resize is necessary template +<<<<<<< HEAD bool _resize_output_check(const Tensor& output, ArrayRef shape) { +======= +static bool _resize_output_check(const Tensor& output, ArrayRef shape) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Tests for resizing of tensors with one or more elements if (at::symint::sizes(output).equals(shape)) { return false; @@ -56,7 +64,11 @@ static void native_resize_(const Tensor& output, SymIntArrayRef shape) { } template +<<<<<<< HEAD bool _resize_output(const Tensor& output, ArrayRef shape) { +======= +static bool _resize_output(const Tensor& output, ArrayRef shape) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) if (_resize_output_check(output, shape)) { // avoid a redispatch for cpu and cuda. 
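// The Repeat.cpp hunk above and the Resize.h checks that follow move from
// TORCH_GUARD_SIZE_OBLIVIOUS to TORCH_GUARD_OR_FALSE / TORCH_GUARD_OR_TRUE. The
// snippet below is only a loose illustration of the intended fallback semantics,
// modelling a possibly-unknown symbolic condition as std::optional<bool>; the real
// macros operate on c10::SymBool.
#include <optional>

using MaybeBool = std::optional<bool>;  // known-true, known-false, or unknown

bool guard_or_false(MaybeBool cond) { return cond.value_or(false); }
bool guard_or_true(MaybeBool cond) { return cond.value_or(true); }

// Example: with an unbacked size, "repeats.size(0) == 1" is unknown, so the
// length-one fast path is skipped instead of installing a guard on the symbol.
bool take_length_one_fast_path(MaybeBool size0_eq_1) {
  return guard_or_false(size0_eq_1);
}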
// TODO: when resize_cuda_ is re-written to be unified with resize_, @@ -196,7 +208,11 @@ static void _maybe_resize_storage(TensorImpl* self, c10::SymInt new_size_bytes) } template +<<<<<<< HEAD TensorImpl* _resize_impl_( +======= +static TensorImpl* _resize_impl_( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TensorImpl* self, ArrayRef size, at::OptionalArrayRef stride, @@ -234,7 +250,11 @@ TensorImpl* resize_impl_cpu_( } template +<<<<<<< HEAD const Tensor& _resize_( +======= +static const Tensor& _resize_( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, ArrayRef size, std::optional optional_memory_format) { diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 9111e4a08007..bb320a512983 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -101,7 +101,11 @@ inline void checkInBoundsForStorage( // It's ok to always evaluate to False for this early return for SymInts because // (1) maybe_convert_symint below only installs guard for int64_t case // (2) we check for this condition in the TORCH_MAYBE_SYM_CHECK below +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(storage_size_bytes, 0))) { +======= + if (TORCH_GUARD_OR_FALSE(sym_eq(storage_size_bytes, 0))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel. return; } @@ -138,7 +142,11 @@ inline void checkSetStorage(Tensor& result, Storage storage, T storage_offset, // storageOffset TORCH_CHECK( +<<<<<<< HEAD storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); +======= + TORCH_GUARD_OR_TRUE(sym_ge(storage_offset, 0)), "Tensor: invalid storage offset ", storage_offset); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // set_storage_{device} (except set_storage_meta__symint) // will (unsafely) set the storage offset and then call resize_impl that diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h index 3a826a7a1b93..1333809e51be 100644 --- a/aten/src/ATen/native/ScatterGatherChecks.h +++ b/aten/src/ATen/native/ScatterGatherChecks.h @@ -19,8 +19,13 @@ inline void scatter_gather_dtype_check( ) { if (index.numel() != 0) { TORCH_CHECK( +<<<<<<< HEAD index.scalar_type() == at::ScalarType::Long, method_name, "(): Expected dtype int64 for index" +======= + index.scalar_type() == at::ScalarType::Long || index.scalar_type() == at::ScalarType::Int, + method_name, "(): Expected dtype int32/int64 for index" +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ); } diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index edaa106fc83c..d3fb1b0fa87f 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -26,7 +26,11 @@ template inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: +<<<<<<< HEAD // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 +======= + // https://github.com/ROCm/hip/issues/2209 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast 
kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); #else scalar_t max = at::_isnan(b) ? b : std::max(a, b); @@ -37,7 +41,11 @@ template inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: +<<<<<<< HEAD // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 +======= + // https://github.com/ROCm/hip/issues/2209 +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); #else scalar_t min = at::_isnan(b) ? b : std::min(a, b); diff --git a/aten/src/ATen/native/SobolEngineOpsUtils.cpp b/aten/src/ATen/native/SobolEngineOpsUtils.cpp index 3d492221c505..5b0f6f7198ef 100644 --- a/aten/src/ATen/native/SobolEngineOpsUtils.cpp +++ b/aten/src/ATen/native/SobolEngineOpsUtils.cpp @@ -31,7 +31,11 @@ is present in the working directory). For additional details see [1]. # read in as dataframe, explicitly use zero values df = pd.DataFrame(rows).fillna(0).astype(int) +<<<<<<< HEAD # peform conversion according to Section 2.1 of [1] +======= + # perform conversion according to Section 2.1 of [1] +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) df["poly"] = 2 * df["a"] + 2 ** df["s"] + 1 # ensure columns are properly ordered diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 92fc59f1c1e7..017b17aa3002 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -559,7 +559,11 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const std:: TORCH_CHECK((input_.sizes()[0] == mask.sizes()[0]) && (input_.sizes()[2] == mask.sizes()[1]), "For mask_type == 1 mask shape should be (B, L)"); if (dim_ != input_.dim() - 1) { +<<<<<<< HEAD // We only process padding mask in the optimized way if softmax is applied along the last dimesion, +======= + // We only process padding mask in the optimized way if softmax is applied along the last dimension, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // otherwise we need to expand the mask into a generic 4D one mask = mask_.view({input_.sizes()[0], 1, 1, input_.sizes()[2]}); mask = mask.expand(input_.sizes()).contiguous(); @@ -570,7 +574,11 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const std:: TORCH_CHECK((mask.dim() == 2) && (input_.sizes()[2] == mask.sizes()[0]) && (input_.sizes()[2] == mask.sizes()[1]), "For mask_type == 0 mask shape should be (L, L)"); if (dim_ != input_.dim() - 1) { +<<<<<<< HEAD // We only process attention mask in a optimized way if softmax is applied along the last dimesion, +======= + // We only process attention mask in a optimized way if softmax is applied along the last dimension, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // otherwise we need to expand the mask into a generic 4D one mask = mask.view({1, 1, input_.sizes()[2], input_.sizes()[2]}); mask = mask.expand(input_.sizes()).contiguous(); diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 0658ed6f27bd..256eff2610c0 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ 
b/aten/src/ATen/native/SpectralOps.cpp @@ -756,7 +756,11 @@ static DimVector default_alldims(const Tensor& self, at::OptionalIntArrayRef dim IntArrayRef dim_unwrapped = *dim_opt; dim.resize(dim_unwrapped.size()); for (const auto i : c10::irange(dim.size())) { +<<<<<<< HEAD dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim(), /*wrap_scalars=*/false); +======= + dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim(), /*wrap_scalar=*/false); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } else { dim.resize(self.dim()); diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index d8d19afeeb3d..aeb5af2038ab 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -147,7 +147,10 @@ namespace at::native { +<<<<<<< HEAD std::string shapes_as_str(TensorList tensors); +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) AdvancedIndex make_info(Tensor self, IOptTensorListRef orig); } // namespace at::native @@ -176,9 +179,16 @@ TORCH_META_FUNC(gather) auto is_index_empty = index.numel() == 0; if (!is_index_empty) { TORCH_CHECK( +<<<<<<< HEAD index.scalar_type() == at::ScalarType::Long, "gather", "(): Expected dtype int64 for index"); +======= + index.scalar_type() == ScalarType::Long || + index.scalar_type() == ScalarType::Int, + "gather", + "(): Expected dtype int32/int64 for index"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } if (is_index_empty) return; @@ -186,7 +196,11 @@ TORCH_META_FUNC(gather) } template +<<<<<<< HEAD void scatter_meta_impl( +======= +static void scatter_meta_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Meta& meta, const Tensor& self, int64_t dim, @@ -358,7 +372,11 @@ TORCH_PRECOMPUTE_META_FUNC(index_copy) } template +<<<<<<< HEAD void index_func_meta_impl( +======= +static void index_func_meta_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) Meta& meta, const Tensor& self, int64_t dim, @@ -593,6 +611,7 @@ static bool all_strides_match(TensorList tensors) { return true; } +<<<<<<< HEAD inline std::string shapes_as_str(TensorList tensors) { std::ostringstream os; bool first = true; @@ -608,6 +627,8 @@ inline std::string shapes_as_str(TensorList tensors) { return os.str(); } +======= +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Replace indexed dimensions in src with stride 0 and the size of the result // tensor. The offset in these dimensions is computed by the kernel using the // index tensor's values and the stride of src. The new shape is not meaningful. 
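// The SpectralOps.cpp hunk above only renames the wrap_scalars/wrap_scalar argument of
// maybe_wrap_dim, but the wrapping rule itself is worth spelling out. A small sketch
// under the usual convention that negative dims count from the end; the special 0-d
// handling controlled by the real helper's extra flag is ignored here.
#include <cstdint>
#include <stdexcept>

int64_t wrap_dim(int64_t dim, int64_t ndim) {
  const int64_t min = -ndim;
  const int64_t max = ndim - 1;
  if (dim < min || dim > max) {
    throw std::out_of_range("Dimension out of range");
  }
  return dim < 0 ? dim + ndim : dim;
}

// e.g. wrap_dim(-1, 4) == 3, so an FFT over dim=-1 of a 4-d input targets dim 3.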
@@ -1009,7 +1030,12 @@ Tensor& _index_put_impl_( } if ((self.device().type() == DeviceType::CUDA || self.device().type() == DeviceType::XPU) && +<<<<<<< HEAD (accumulate || globalContext().deterministicAlgorithms())) { +======= + (accumulate || + (globalContext().deterministicAlgorithms() && value_.numel() > 1))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_CHECK( value_.device() == self.device(), "expected device ", @@ -2249,7 +2275,11 @@ template < typename T, typename ReduceStub, typename FillStub> +<<<<<<< HEAD void scatter_impl( +======= +static void scatter_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, const Tensor& index, @@ -2822,7 +2852,11 @@ Tensor _gather_sparse_backward( } template +<<<<<<< HEAD int64_t count_nonzero_impl(TensorIteratorBase& iter, Range range) { +======= +static int64_t count_nonzero_impl(TensorIteratorBase& iter, Range range) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t num_nonzero = 0; auto loop = [&](char** data, const int64_t* strides, int64_t n) { diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index f37376b5fc83..b42c2854c1cb 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -89,6 +89,19 @@ static inline void check_for_unsupported_isin_dtype(const ScalarType type) { type); } +<<<<<<< HEAD +======= +static inline void check_for_unsupported_clamp_dtypes(ScalarType dtype) { + TORCH_CHECK_NOT_IMPLEMENTED( + !isComplexType(dtype), "clamp is not supported for complex types"); +} + +static inline void check_for_unsupported_clamp_dtypes(const Scalar& s) { + TORCH_CHECK_NOT_IMPLEMENTED( + !s.isComplex(), "clamp is not supported for complex types"); +} + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) TORCH_META_FUNC(clamp) (const Tensor& self, const OptionalScalarRef min, const OptionalScalarRef max) { if (!min && !max) { @@ -96,9 +109,14 @@ TORCH_META_FUNC(clamp) false, "torch.clamp: At least one of 'min' or 'max' must not be None"); } // Manual type promotion, since scalars have to participate in it +<<<<<<< HEAD ScalarType result_type = self.scalar_type(); TORCH_CHECK( !isComplexType(result_type), "clamp is not supported for complex types"); +======= + auto result_type = self.scalar_type(); + check_for_unsupported_clamp_dtypes(result_type); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Floating is the highest supported if (!isFloatingType(result_type)) { at::native::ResultTypeState state = {}; @@ -122,8 +140,12 @@ TORCH_META_FUNC(clamp) self.dtype()); } // make sure scalars weren't complex +<<<<<<< HEAD TORCH_CHECK( !isComplexType(result_type), "clamp is not supported for complex types"); +======= + check_for_unsupported_clamp_dtypes(result_type); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) build_unary_op(maybe_get_output(), self.to(result_type)); } @@ -132,9 +154,13 @@ TORCH_META_FUNC2(clamp, Tensor) TORCH_CHECK( min || max, "torch.clamp: At least one of 'min' or 'max' must not be None"); +<<<<<<< HEAD TORCH_CHECK( 
!isComplexType(self.scalar_type()), "clamp is not supported for complex types"); +======= + check_for_unsupported_clamp_dtypes(self.scalar_type()); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #define CLAMP_CONFIG() \ TensorIteratorConfig() \ .set_check_mem_overlap(true) \ @@ -157,10 +183,16 @@ TORCH_META_FUNC(clamp_max)(const Tensor& self, const Scalar& max) { // we could wrap max into tensor and send to tensor overload, // but relu is implemented via clamp_min, so for perf an uniformity reasons // do a faster but correct thing +<<<<<<< HEAD ScalarType result_type = self.scalar_type(); TORCH_CHECK( !isComplexType(result_type), "clamp is not supported for complex types"); TORCH_CHECK(!max.isComplex(), "clamp is not supported for complex types"); +======= + auto result_type = self.scalar_type(); + check_for_unsupported_clamp_dtypes(result_type); + check_for_unsupported_clamp_dtypes(max); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Floating is the highest supported if (!isFloatingType(result_type)) { auto result_type = at::native::result_type(self, max); @@ -183,10 +215,16 @@ TORCH_META_FUNC2(clamp_max, Tensor)(const Tensor& self, const Tensor& max) { } TORCH_META_FUNC(clamp_min)(const Tensor& self, const Scalar& min) { +<<<<<<< HEAD ScalarType result_type = self.scalar_type(); TORCH_CHECK( !isComplexType(result_type), "clamp is not supported for complex types"); TORCH_CHECK(!min.isComplex(), "clamp is not supported for complex types"); +======= + auto result_type = self.scalar_type(); + check_for_unsupported_clamp_dtypes(result_type); + check_for_unsupported_clamp_dtypes(min); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // Floating is the highest supported if (!isFloatingType(result_type)) { auto result_type = at::native::result_type(self, min); @@ -485,13 +523,21 @@ void _assert_async_cpu(const Tensor& self) { void _assert_async_msg_cpu(const Tensor& self, std::string_view assert_msg) { TORCH_CHECK( native::is_nonzero(self), +<<<<<<< HEAD assert_msg != "" ? assert_msg : "Assertion is failed"); +======= + !assert_msg.empty() ? assert_msg : "Assertion is failed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } void _assert_scalar(const Scalar& scalar, std::string_view assert_msg) { TORCH_SYM_CHECK( scalar.toSymBool(), +<<<<<<< HEAD assert_msg != "" ? assert_msg : "Assertion is failed"); +======= + !assert_msg.empty() ? assert_msg : "Assertion is failed"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor _functional_assert_scalar( @@ -569,7 +615,11 @@ static void isin_sorting( } template +<<<<<<< HEAD Device out_device(Args&... inps) { +======= +static Device out_device(Args&... 
inps) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) for (const auto& i : {inps...}) { if (i.device() != at::kCPU) { return i.device(); @@ -739,7 +789,11 @@ std::tuple mode_out( } template +<<<<<<< HEAD void minmax_out_impl( +======= +static void minmax_out_impl( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, int64_t dim, bool keepdim, diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 3a60eddbe8fc..690367b0409e 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -806,7 +806,11 @@ Tensor sparse_compressed_to_dense( // Computes the strides for view_dtype output when the view dtype is // smaller than the original dtype +<<<<<<< HEAD inline SymDimVector compute_strides_for_view_dtype_downsize( +======= +static inline SymDimVector compute_strides_for_view_dtype_downsize( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SymIntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, @@ -832,7 +836,11 @@ inline SymDimVector compute_strides_for_view_dtype_downsize( // Computes the strides for view_dtype output when the view dtype is // larger than the original dtype +<<<<<<< HEAD inline SymDimVector compute_strides_for_view_dtype_upsize( +======= +static inline SymDimVector compute_strides_for_view_dtype_upsize( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) SymIntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, @@ -1013,6 +1021,7 @@ static Tensor _batch_tile_tensor( static Tensor _mask_to_indices(const Tensor& mask) { // This function returns a vector of the indices at which given +<<<<<<< HEAD // boolean mask is True. at::nonzero can achieve the same, but // we yet have to compare the performance difference. TORCH_CHECK( @@ -1039,6 +1048,25 @@ static std::pair _not_zero_mask_to_col_row_indices( .expand_as(not_zero_mask) .masked_select(not_zero_mask); return std::pair(col_indices, row_indices); +======= + // boolean mask is True. Here at::nonzero performs test (time/mem). 
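// The TensorConversions.cpp change starting here swaps the masked_select-based helpers
// for at::nonzero. A hedged sketch of the two helpers as free functions (ATen calls
// only, with the debug asserts dropped), plus the shape convention they rely on:
#include <ATen/ATen.h>
#include <utility>

at::Tensor mask_to_indices(const at::Tensor& mask) {
  // nonzero() on a 1-d mask returns shape (nnz, 1); flatten() gives the flat indices.
  return at::nonzero(mask).flatten();
}

std::pair<at::Tensor, at::Tensor> mask_to_col_row_indices(const at::Tensor& not_zero_mask) {
  // nonzero() on a 2-d mask returns shape (nnz, 2): column 0 holds row indices and
  // column 1 holds column indices, so a single kernel yields both outputs.
  auto nz = not_zero_mask.nonzero();
  return {nz.select(1, 1), nz.select(1, 0)};
}

// For a mask [[false, true], [true, false]], nonzero() yields rows (0, 1) and (1, 0),
// so col_indices is [1, 0] and row_indices is [0, 1], matching the (col, row) order
// the caller destructures via std::tie.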
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + mask.dim() == 1, "_mask_to_indices only supports 1-d masks."); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + mask.dtype() == at::kBool, "Expected mask to be of dtype bool."); + return at::native::flatten(at::nonzero(mask)); +} + +static std::pair _not_zero_mask_to_col_row_indices( + Tensor not_zero_mask) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + not_zero_mask.dim() == 2, + "_not_zero_mask_to_col_row_indices only supports 2-d masks."); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + not_zero_mask.dtype() == at::kBool, "Expected mask to be of dtype bool."); + auto nz = not_zero_mask.nonzero(); + return {nz.select(1, 1), nz.select(1, 0)}; +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } // Sparse layout conversions Start @@ -1319,8 +1347,13 @@ static Tensor dense_to_sparse_compressed( Tensor col_indices; Tensor compressed_indices; if (compressed_rows_layout) { +<<<<<<< HEAD std::tie(col_indices, row_indices) = _not_zero_mask_to_col_row_indices( not_zero_mask, at::kLong, not_zero_mask.device()); +======= + std::tie(col_indices, row_indices) = + _not_zero_mask_to_col_row_indices(not_zero_mask); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) compressed_indices = at::_convert_indices_from_coo_to_csr( row_indices, not_zero_mask.size(0), false /*out_int32*/); { @@ -1328,8 +1361,13 @@ static Tensor dense_to_sparse_compressed( values = values.flatten(0, 1).index_select(0, mask_indices); } } else { +<<<<<<< HEAD std::tie(row_indices, col_indices) = _not_zero_mask_to_col_row_indices( not_zero_mask.transpose(1, 0), at::kLong, not_zero_mask.device()); +======= + std::tie(row_indices, col_indices) = + _not_zero_mask_to_col_row_indices(not_zero_mask.transpose(1, 0)); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) compressed_indices = at::_convert_indices_from_coo_to_csr( col_indices, not_zero_mask.size(-1), false /*out_int32*/); { @@ -1708,7 +1746,11 @@ static Tensor sparse_compressed_to_flipped( // Step 4: // Convert the COO indices to the CSC/BSC indices and form the output. +<<<<<<< HEAD // We need to sort COO indices along the "tranposed" dim to satisfy the +======= + // We need to sort COO indices along the "transposed" dim to satisfy the +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // invariant of sorted plain indices. // Hash coo indices by converting 2d indices to linear offsets with // more "weight" (aka stride) placed on the "transposed" dimension. @@ -1989,7 +2031,11 @@ TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cpu) * Modified to ensure sorted BSR column indices. 
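// dense_to_sparse_compressed above hands the row (or column) indices of the nonzeros
// to _convert_indices_from_coo_to_csr. That compression is a counting pass plus a
// prefix sum; a small stand-alone sketch, assuming int64 indices and out_int32=false
// as in the calls above:
#include <cstdint>
#include <vector>

std::vector<int64_t> coo_rows_to_csr(const std::vector<int64_t>& row_indices, int64_t n_rows) {
  std::vector<int64_t> crow(n_rows + 1, 0);
  for (int64_t r : row_indices) {
    ++crow[r + 1];  // count nonzeros per row
  }
  for (int64_t r = 0; r < n_rows; ++r) {
    crow[r + 1] += crow[r];  // prefix sum: row r occupies [crow[r], crow[r + 1])
  }
  return crow;
}

// e.g. row_indices {0, 0, 2} with n_rows 3 gives crow {0, 2, 2, 3}.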
*/ template +<<<<<<< HEAD void _compressed_to_block_compressed_cpu_kernel( +======= +static void _compressed_to_block_compressed_cpu_kernel( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const index_t n_compressed, // Tensor size along compressed dimension const index_t n_plain, // Tensor size along plain dimension const index_t C, // Block size along compressed dimensions @@ -2086,7 +2132,11 @@ void _compressed_to_block_compressed_cpu_kernel( * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h */ template +<<<<<<< HEAD index_t compressed_count_blocks( +======= +static index_t compressed_count_blocks( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const index_t n_compressed, // Tensor size along compressed dimension const index_t n_plain, // Tensor size along plain dimension const index_t C, // Block size along compressed dimensions @@ -2110,7 +2160,11 @@ index_t compressed_count_blocks( } template +<<<<<<< HEAD Tensor _compressed_to_block_compressed_cpu( +======= +static Tensor _compressed_to_block_compressed_cpu( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, IntArrayRef blocksize) { static_assert( diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index b87e7142ea08..a9969ba462a8 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1214,6 +1214,31 @@ Tensor randint_like( Tensor randint_like( const Tensor& self, +<<<<<<< HEAD +======= + const Tensor& high, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory, + std::optional optional_memory_format) { + TORCH_CHECK( + high.numel() == 1 && high.ndimension() == 0 && high.device().is_cpu(), + "high must be a scalar tensor and on CPU"); + int64_t high_scalar = high.item(); + return at::native::randint_like( + self, + high_scalar, + dtype, + layout, + device, + pin_memory, + optional_memory_format); +} + +Tensor randint_like( + const Tensor& self, +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) int64_t low, int64_t high, std::optional dtype, @@ -2072,22 +2097,40 @@ Tensor vander(const Tensor& x, std::optional N, bool increasing) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tensor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template +<<<<<<< HEAD Tensor tensor_cpu(ArrayRef values, const TensorOptions& options) { +======= +static Tensor tensor_cpu(ArrayRef values, const TensorOptions& options) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::detail::tensor_cpu(values, options); } template +<<<<<<< HEAD Tensor tensor_backend(ArrayRef values, const TensorOptions& options) { +======= +static Tensor tensor_backend(ArrayRef values, const TensorOptions& options) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::detail::tensor_backend(values, options); } template +<<<<<<< HEAD Tensor tensor_complex_cpu(ArrayRef values, const TensorOptions& options) { +======= +static Tensor tensor_complex_cpu( + ArrayRef values, + const TensorOptions& options) { +>>>>>>> 
5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::detail::tensor_complex_cpu(values, options); } template +<<<<<<< HEAD Tensor tensor_complex_backend( +======= +static Tensor tensor_complex_backend( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) ArrayRef values, const TensorOptions& options) { return at::detail::tensor_complex_backend(values, options); diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c66ff757641b..b6eea2963e38 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -24,6 +24,10 @@ #include #include #include +<<<<<<< HEAD +======= +#include +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) #include #include #include @@ -216,7 +220,11 @@ namespace at::meta { +<<<<<<< HEAD inline c10::MemoryFormat cat_compute_output_memory_format( +======= +static inline c10::MemoryFormat cat_compute_output_memory_format( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const MaterializedITensorListRef& inputs) { std::optional format = std::nullopt; for (const Tensor& t : inputs) { @@ -430,7 +438,11 @@ Tensor& set_storage_meta__symint( size, stride, storage_offset); // Matches maybe_resize_storage_cpu no-numel behavior +<<<<<<< HEAD if (TORCH_GUARD_SIZE_OBLIVIOUS(result.sym_numel().sym_ne(0))) { +======= + if (TORCH_GUARD_OR_TRUE(result.sym_numel().sym_ne(0))) { +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // maybe_resize_storage_cpu can handle no storage exists at all but // that should never be the case here TORCH_INTERNAL_ASSERT(storage); @@ -439,12 +451,16 @@ Tensor& set_storage_meta__symint( // All meta data pointers are the same, so we don't have to "re" allocate // it. TODO: Actually this might not quite be correct if we use special // pointers to track whether or not fake cuda tensors are pinned or not +<<<<<<< HEAD const auto itemsize = result.dtype().itemsize(); c10::SymInt new_size_bytes = result.is_contiguous() ? at::detail::computeStorageNbytesContiguous( size, itemsize, std::move(storage_offset)) : at::detail::computeStorageNbytes( size, stride, itemsize, std::move(storage_offset)); +======= + +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // TODO: When there are unbacked SymInts, we unconditionally skip the // setter. This is technically wrong, but we cannot conveniently test // the real condition in many cases, because a lot of people are using @@ -453,10 +469,27 @@ Tensor& set_storage_meta__symint( // // The old behavior was to unconditionally set_nbytes, but I think not // setting it is more safe. +<<<<<<< HEAD if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() && TORCH_GUARD_SIZE_OBLIVIOUS( new_size_bytes.sym_gt(storage.sym_nbytes()))) { storage.set_nbytes(std::move(new_size_bytes)); +======= + if (result.sym_numel().has_hint()) { + const auto itemsize = result.dtype().itemsize(); + + c10::SymInt new_size_bytes = result.is_contiguous() + ? 
at::detail::computeStorageNbytesContiguous( + size, itemsize, std::move(storage_offset)) + : at::detail::computeStorageNbytes( + size, stride, itemsize, std::move(storage_offset)); + + if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() && + TORCH_GUARD_SIZE_OBLIVIOUS( + new_size_bytes.sym_gt(storage.sym_nbytes()))) { + storage.set_nbytes(std::move(new_size_bytes)); + } +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } } return result; @@ -572,7 +605,11 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) { // } // Then define for each sparse dim the number of reps for each nnz index/value +<<<<<<< HEAD // due to broadcasting. Repetitions do not take into accout the current value +======= + // due to broadcasting. Repetitions do not take into account the current value +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) // of nnz - this will be taken care of later { auto nnz_repeats = c10::DimVector(res_sparse_dim); nnz_repeats.back() = res_sparse_dim_broadcast_mask.back(); @@ -757,22 +794,38 @@ TORCH_IMPL_FUNC(cat_out_cpu) } Tensor& cat_out(TensorList tensors, Dimname dim, Tensor& result) { +<<<<<<< HEAD TORCH_CHECK(!tensors.empty(), "expected a non-empty list of Tensors"); +======= + TORCH_CHECK_VALUE(!tensors.empty(), "expected a non-empty list of Tensors"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim)); } Tensor cat(TensorList tensors, Dimname dim) { +<<<<<<< HEAD TORCH_CHECK(!tensors.empty(), "expected a non-empty list of Tensors"); +======= + TORCH_CHECK_VALUE(!tensors.empty(), "expected a non-empty list of Tensors"); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) return at::cat(tensors, dimname_to_position(tensors[0], dim)); } // torch.concat, alias for torch.cat Tensor& concat_out(TensorList tensors, Dimname dim, Tensor& result) { +<<<<<<< HEAD return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim)); } Tensor concat(TensorList tensors, Dimname dim) { return at::cat(tensors, dimname_to_position(tensors[0], dim)); +======= + return cat_out(tensors, dim, result); +} + +Tensor concat(TensorList tensors, Dimname dim) { + return at::cat(tensors, dim); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor& concat_out(TensorList tensors, int64_t dim, Tensor& result) { @@ -785,11 +838,19 @@ Tensor concat(TensorList tensors, int64_t dim) { // torch.concatenate, alias for torch.cat Tensor& concatenate_out(TensorList tensors, Dimname dim, Tensor& result) { +<<<<<<< HEAD return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim)); } Tensor concatenate(TensorList tensors, Dimname dim) { return at::cat(tensors, dimname_to_position(tensors[0], dim)); +======= + return cat_out(tensors, dim, result); +} + +Tensor concatenate(TensorList tensors, Dimname dim) { + return at::cat(tensors, dim); +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) } Tensor& concatenate_out(TensorList tensors, int64_t dim, Tensor& result) { @@ -1119,7 +1180,11 @@ std::vector tensor_split_sections_symint( } template +<<<<<<< HEAD 
std::vector _tensor_split_indices( +======= +static std::vector _tensor_split_indices( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, ArrayRef indices, int64_t dim) { @@ -1417,7 +1482,11 @@ Tensor as_strided_tensorimpl( } template +<<<<<<< HEAD inline void setStridedUnchecked( +======= +static inline void setStridedUnchecked( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, ArrayRef size, ArrayRef stride, @@ -1922,7 +1991,11 @@ Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { // templated for ArrayRef and SmallVector use cases // template +<<<<<<< HEAD Tensor alias_with_sizes_and_strides( +======= +static Tensor alias_with_sizes_and_strides( +>>>>>>> 5729657180 ([ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#2791)) const Tensor& self, const Vec& sizes, const Vec& strides) { @@ -1958,7 +2031,11 @@ Tensor alias_with_sizes_and_strides( // SymIntArrayRef/ArrayRef and // SmallVector/SymDimVector template
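
For context on the TensorFactories.cpp hunk above: the incoming side introduces a `randint_like` overload that takes the exclusive upper bound `high` as a 0-d CPU tensor and forwards it to the existing integer overload. Below is a minimal sketch of the equivalent call sequence; the wrapper name is illustrative and not part of the patch, and only calls visible in the hunk or in the public ATen API are used.

#include <ATen/ATen.h>

// Sketch of what the added overload reduces to: `high` must be a 0-d CPU
// tensor holding the exclusive upper bound; the hunk checks exactly that,
// unwraps it with item<int64_t>(), and dispatches to the int64_t overload.
at::Tensor randint_like_with_tensor_high_sketch(
    const at::Tensor& self,
    const at::Tensor& high) {
  TORCH_CHECK(
      high.numel() == 1 && high.ndimension() == 0 && high.device().is_cpu(),
      "high must be a scalar tensor and on CPU");
  const int64_t high_scalar = high.item<int64_t>();
  // Existing overload: fills a tensor shaped/typed like `self` with uniform
  // integers in [0, high_scalar).
  return at::randint_like(self, high_scalar);
}
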